Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[naverpost] add 'post' and 'user' extractors #4791

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/supportedsites.md
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Blogs, Posts</td>
<td></td>
</tr>
<tr>
<td>NaverPost</td>
<td>https://post.naver.com/</td>
<td>Posts, User Profiles</td>
<td></td>
</tr>
<tr>
<td>NaverWebtoon</td>
<td>https://comic.naver.com/</td>
Expand Down
1 change: 1 addition & 0 deletions gallery_dl/extractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@
"myhentaigallery",
"myportfolio",
"naver",
"naverpost",
"naverwebtoon",
"newgrounds",
"nhentai",
Expand Down
143 changes: 143 additions & 0 deletions gallery_dl/extractor/naverpost.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://post.naver.com/"""

from .common import Extractor, Message
from .. import text, exception
import json
import re

# Shared URL prefix for all extractor patterns in this module: an optional
# http(s) scheme, an optional "m." subdomain, then post.naver.com.
# The extractor classes append their own path and query to this prefix.
BASE_PATTERN = r"(?:https?://)?(?:m\.)?post\.naver\.com"


class NaverpostExtractor(Extractor):
    """Base class for naver post extractors"""
    category = "naverpost"
    root = "https://post.naver.com"
    request_interval = (0.5, 1.5)

    def _call(self, url, params=None):
        """Send a request to 'url' and return the response object.

        HTTP 401, 403 and 404 responses are translated into
        AuthenticationError, AuthorizationError and NotFoundError.
        Any other HttpError is logged at debug level and re-raised, so
        callers never receive None in place of a response.
        """
        if params is None:
            params = {}
        try:
            return self.request(url, params=params)
        except exception.HttpError as exc:
            if exc.status == 401:
                raise exception.AuthenticationError()
            if exc.status == 403:
                raise exception.AuthorizationError()
            if exc.status == 404:
                raise exception.NotFoundError(self.subcategory)
            self.log.debug(exc)
            # Callers access '.text' on the result right away;
            # propagate the original error instead of returning None.
            raise

    def _pagination(self, url, params=None):
        """Yield absolute post URLs from a paginated async endpoint.

        Requests 'url' repeatedly, passing the 'nextFromNo' value of each
        response as the 'fromNo' parameter of the next request, until a
        response no longer contains 'nextFromNo'.
        """
        # Work on a copy so the caller's dict is never modified.
        params = dict(params) if params else {}
        while True:
            body = self._call(url, params).text
            # the `html` string in the response contains escaped single
            # quotes, which would throw a JSONDecodeError exception
            data = json.loads(body.replace(r"\'", "'"))
            for endpoint in text.extract_iter(
                    data["html"], '<div class="text_area">\n<a href="', '"'):
                yield self.root + endpoint
            if "nextFromNo" not in data:
                return
            params["fromNo"] = data["nextFromNo"]


class NaverpostPostExtractor(NaverpostExtractor):
    """Extractor for posts on post.naver.com"""
    subcategory = "post"
    filename_fmt = "{image[id]}.{extension}"
    directory_fmt = ("{category}", "{author}", "{volume_no}")
    archive_fmt = "{image[id]}"
    pattern = (BASE_PATTERN + r"/viewer/postView\.(naver|nhn)"
               r"\?volumeNo=(\d+)(?:&.+)?")
    example = "https://post.naver.com/viewer/postView.naver?volumeNo=12345"

    def __init__(self, match):
        NaverpostExtractor.__init__(self, match)
        # 'pattern' also accepts URLs without a scheme; add one so the
        # URL can be requested as-is.
        self.url = text.ensure_http_scheme(match.group(0))
        self.page_ext = match.group(1)
        self.volume_no = match.group(2)

    def metadata(self, page):
        """Collect post metadata from the HTML of a post page.

        Title, description, author and creation date are read from the
        page's "og:*" content attributes; the view count is read from
        either the 'post_view' or the 'se_view' span.
        """
        views = (
            text.extr(page, '<span class="post_view">', ' ') or
            text.extr(page, '<span class="se_view" style="">', ' ')
        )
        return {
            "title": text.unescape(
                text.extr(page, '"og:title" content="', '"')),
            "description": text.unescape(
                text.extr(page, '"og:description" content="', '"')),
            "author": text.extr(page, '"og:author" content="', '"'),
            "date": text.parse_datetime(
                text.extr(page, '"og:createdate" content="', '"'),
                format="%Y.%m.%d. %H:%M:%S", utcoffset=9),
            "volume_no": self.volume_no,
            # strip thousands separators before parsing
            "views": text.parse_int(views.replace(",", "")),
            "url": self.url,
        }

    def items(self):
        """Yield the post's directory message, then one Url per image.

        Only <img> tags carrying one of the known image classes and a
        'data-src' URL ending in a '?type=w<digits>' query are kept;
        that query is removed from the yielded URL.
        """
        page = self._call(self.url).text
        data = self.metadata(page)

        yield Message.Directory, data

        image_classes = ("img_attachedfile", "se_mediaImage")
        # Compiled once; subn() strips the query and reports in a single
        # pass whether it was present.
        strip_query = re.compile(r"\?type=w\d+$").subn
        for image in text.extract_iter(page, "<img", ">"):
            classes = text.extr(image, ' class="', '"').split()
            if not any(name in classes for name in image_classes):
                continue
            url, stripped = strip_query(
                "", text.extr(image, ' data-src="', '"'))
            if not stripped:
                continue

            img = {
                "id": text.extr(image, ' id="', '"'),
                "title": text.extr(image, ' title="', '"'),
                "attachment-id": text.extr(
                    image, ' data-attachment-id="', '"'),
                "alt": None,
                "url": url,
            }
            alt = text.extr(image, ' alt="', '"')
            if alt and alt.endswith(".jpg"):
                # use the 'alt' text as filename when it looks like one
                img["alt"] = alt
                data["filename"], _, data["extension"] = alt.rpartition(".")
            else:
                text.nameext_from_url(text.unquote(url), data)
            data["image"] = img
            yield Message.Url, url, data


class NaverpostUserExtractor(NaverpostExtractor):
    """Extractor for all posts from a user on post.naver.com"""
    subcategory = "user"
    pattern = BASE_PATTERN + r"/my\.naver\?memberNo=(\d+)"
    example = "https://post.naver.com/my.naver?memberNo=12345"

    def __init__(self, match):
        NaverpostExtractor.__init__(self, match)
        self.member_no = match.group(1)

    def items(self):
        """Queue every post URL of the user for the post extractor."""
        # the same metadata dict accompanies every queued URL
        queue_data = {"_extractor": NaverpostPostExtractor}
        query = {"memberNo": self.member_no}
        post_urls = self._pagination(self.root + "/async/my.naver", query)
        for post_url in post_urls:
            yield Message.Queue, post_url, queue_data
1 change: 1 addition & 0 deletions scripts/supportedsites.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
"mastodon.social": "mastodon.social",
"myhentaigallery": "My Hentai Gallery",
"myportfolio" : "Adobe Portfolio",
"naverpost" : "NaverPost",
"naverwebtoon" : "NaverWebtoon",
"nhentai" : "nhentai",
"nijie" : "nijie",
Expand Down
53 changes: 53 additions & 0 deletions test/results/naverpost.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

from gallery_dl.extractor import naverpost

# Case-insensitive pattern for the expected image URLs: anything on
# post-phinf.pstatic.net containing a gif/jpg/jpeg/png/webp extension.
IMAGE_URL_PATTERN = r"(?i)https://post-phinf\.pstatic\.net/.*\.(?:gif|jpe?g|png|webp)"


# Expected results for the naverpost extractors. Keys starting with "#"
# describe the test itself; the remaining keys are expected metadata values.
__tests__ = (
# Post URL on the "m." subdomain using the ".nhn" page extension.
{
"#url": "https://m.post.naver.com/viewer/postView.nhn?volumeNo=15861102&memberNo=16220685",
"#comment": ".nhn page extension",
"#category": ("", "naverpost", "post"),
"#class": naverpost.NaverpostPostExtractor,
"#pattern": IMAGE_URL_PATTERN,
"#count": 34,

"title": "[쇼! 음악중심] 180526 방탄소년단 FAKE LOVE 현장 포토",
"description": "[BY MBC예능연구소] [쇼! 음악중심] 589회, 20180526 ※본 콘텐츠는 상업적 용도의 사용을 금합니다.",
"author": "MBC예능연구소",
"date": "dt:2018-05-29 12:09:34",
"views": int,
},

# Post URL using the ".naver" page extension.
{
"#url": "https://post.naver.com/viewer/postView.naver?volumeNo=31389956&memberNo=29156514",
"#comment": ".naver page extension",
"#category": ("", "naverpost", "post"),
"#class": naverpost.NaverpostPostExtractor,
"#pattern": IMAGE_URL_PATTERN,
"#count": 48,

"title": "매일 밤 꿈꿔 왔던 드림캐쳐 '바람아' 활동 비하인드 현장",
"description": "[BY 드림캐쳐컴퍼니] 안녕하세요.드림캐쳐 포스트 지기입니다!(*・▽・*)'Odd Eye' 활동이 끝나고 아쉬웠을...",
"author": "드림캐쳐컴퍼니",
"date": "dt:2021-05-03 06:00:09",
"views": int,
},

# User profile URL. The range asks for 21 results, one more than the
# 20-per-request limit stated in "#comment"; each result must match the
# post extractor's URL pattern.
{
"#url": "https://post.naver.com/my.naver?memberNo=29156514",
"#comment": "up to 20 posts are returned per request",
"#category": ("", "naverpost", "user"),
"#class": naverpost.NaverpostUserExtractor,
"#pattern": naverpost.NaverpostPostExtractor.pattern,
"#range": "1-21",
"#count": 21,
},

)
Loading