Skip to content

Commit

Permalink
[weverse] add extractors
Browse files Browse the repository at this point in the history
  • Loading branch information
bradenhilton committed Nov 6, 2023
1 parent 807ddde commit cbb6bc7
Show file tree
Hide file tree
Showing 3 changed files with 370 additions and 0 deletions.
20 changes: 20 additions & 0 deletions docs/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3599,6 +3599,26 @@ Description
Download video files.


extractor.weverse.access-token
------------------------------
Type
``string``
Default
``null``
Description
Your Weverse account access token.

The token can be found in the ``we2_access_token`` cookie in the
``.weverse.io`` cookie domain after logging in to your account.

An invalid or expired token
will result in ``401 Unauthorized`` errors.

If this option is unset and no ``we2_access_token`` cookie is available,
an extra HTTP request will be sent with your ``username`` and ``password``
to attempt to fetch a new token.


extractor.ytdl.enabled
----------------------
Type
Expand Down
1 change: 1 addition & 0 deletions gallery_dl/extractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@
"webmshare",
"webtoons",
"weibo",
"weverse",
"wikiart",
"wikifeet",
"xhamster",
Expand Down
349 changes: 349 additions & 0 deletions gallery_dl/extractor/weverse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,349 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://weverse.io/"""

from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
import binascii
import hashlib
import hmac
import time
import urllib.parse
import uuid
from collections import OrderedDict

# Site root: optional scheme and optional "m." subdomain
BASE_PATTERN = r"(?:https?://)?(?:m\.)?weverse\.io"
# Site root plus one path segment, captured as group 1
# (used by the extractors below as the community keyword)
COMMUNITY_PATTERN = BASE_PATTERN + r"/(\w+)"

# Path segment of lowercase hexadecimal characters (captured as member ID)
MEMBER_ID_PATTERN = r"/([a-f0-9]+)"
# Path segment of the form "<digit>-<digits>" (captured as post ID)
POST_ID_PATTERN = r"/(\d-\d+)"


class WeverseExtractor(Extractor):
    """Base class for weverse extractors

    Resolves an access token (config option, cookie, or username/password
    login) and provides a WeverseAPI instance as 'self.api' to subclasses.
    """
    category = "weverse"
    cookies_domain = ".weverse.io"
    cookies_names = ("we2_access_token",)
    root = "https://weverse.io"
    filename_fmt = "{filename}.{extension}"
    request_interval = 1.0

    def _init(self):
        """Log in and create the API client

        Raises AuthenticationError if no access token could be obtained.
        Every subclass calls 'self.api' right away, so failing here gives
        a clear error instead of an AttributeError later on.
        """
        self.login()
        if not self.access_token:
            raise exception.AuthenticationError()
        self.api = WeverseAPI(self, self.access_token)

    def login(self):
        """Determine the access token and store it in 'self.access_token'

        Sources, in order of precedence:
        1. the 'access-token' config option
        2. an already present 'we2_access_token' cookie
        3. a login request with the configured username and password
        """
        token = self.config("access-token")
        if token:
            self.access_token = token
            return

        if not self.cookies_check(self.cookies_names):
            username, password = self._get_auth_info()
            if username:
                self.cookies_update(
                    self._login_impl(username, password),
                    self.cookies_domain)

        self.access_token = self.cookies.get(self.cookies_names[0])

    @cache(maxage=365*24*3600, keyarg=1)
    def _login_impl(self, username, password):
        """Request an access token with account credentials

        Returns a dict mapping the access token cookie name to the token.
        Raises AuthenticationError if the response has no 'accessToken'.
        The result is cached via the 'cache' decorator
        (presumably keyed on the username, i.e. argument 1).
        """
        endpoint = ("https://accountapi.weverse.io"
                    "/web/api/v2/auth/token/by-credentials")
        data = {"email": username, "password": password}
        headers = {
            "x-acc-app-secret": "5419526f1c624b38b10787e5c10b2a7a",
            "x-acc-app-version": "2.2.20-alpha.0",
            "x-acc-language": "en",
            "x-acc-service-id": "weverse",
            "x-acc-trace-id": str(uuid.uuid4()),
        }
        res = self.request(
            endpoint, method="POST", data=data, headers=headers).json()
        if "accessToken" not in res:
            raise exception.AuthenticationError()
        return {self.cookies_names[0]: res["accessToken"]}

    def metadata(self, data):
        """Add 'date' and 'author_name' to 'data' unless already present"""
        if "date" not in data and "publishedAt" in data:
            # 'publishedAt' is scaled down by 1000 before parsing
            # (presumably a millisecond timestamp)
            data["date"] = text.parse_timestamp(data["publishedAt"] / 1000)

        if "author_name" not in data and "author" in data:
            author = data["author"]
            # tolerate a present-but-null 'artistOfficialProfile'
            official = author.get("artistOfficialProfile") or {}
            data["author_name"] = (
                official.get("officialName") or author["profileName"])

    def has_media(self, data):
        """Return True if any known media field of 'data' is not '{}'"""
        return any(
            key in data and data[key] != {}
            for key in ("extension", "attachment", "photo", "video")
        )


class WeversePostExtractor(WeverseExtractor):
    """Extractor for weverse posts"""
    subcategory = "post"
    directory_fmt = ("{category}", "{community[communityName]}",
                     "{author_name}", "{postId}")
    # a post can hold several files; include the per-file name so that
    # an archive does not treat all files after the first as duplicates
    archive_fmt = "{postId}_{filename}"
    pattern = COMMUNITY_PATTERN + r"/(?:artist|fanpost)" + POST_ID_PATTERN
    example = "https://weverse.io/abcdef/artist/1-123456789"

    def __init__(self, match):
        WeverseExtractor.__init__(self, match)
        self.post_id = match.group(2)

    def items(self):
        """Yield a directory message and one URL message per attachment

        Posts for which has_media() is false are skipped entirely.
        Only 'photo' and 'video' attachments are downloaded;
        other attachment types are logged and ignored.
        """
        data = self.api.post(self.post_id)

        # skip posts with no media
        if not self.has_media(data):
            self.log.debug("Skipping %s (no media)", self.url)
            return

        self.metadata(data)

        # has_media() may have passed because of another field,
        # so 'attachment' is not guaranteed to exist
        attachments = data.pop("attachment", None) or {}

        yield Message.Directory, data
        for attachment_type, attachment_data in attachments.items():
            if attachment_type not in ("photo", "video"):
                # do not emit URL messages with an empty URL
                self.log.debug(
                    "Skipping unsupported attachment type '%s'",
                    attachment_type)
                continue

            for attachment in attachment_data.values():
                if attachment_type == "photo":
                    url = attachment["url"]
                    file_id = attachment["photoId"]
                else:
                    file_id = attachment["videoId"]
                    # video URLs need an extra API request
                    url = self.api.video(file_id)["url"]

                data["filename"] = self.category + "_" + file_id
                data["extension"] = text.ext_from_url(url)
                yield Message.Url, url, data


class WeverseProfileExtractor(WeverseExtractor):
    """Extractor for weverse community profiles"""
    subcategory = "profile"
    pattern = COMMUNITY_PATTERN + "/profile" + MEMBER_ID_PATTERN
    example = ("https://weverse.io/abcdef"
               "/profile/a0b1c2d3e4f5a6b7c8d9e0f1a2b3c4d5")

    def __init__(self, match):
        WeverseExtractor.__init__(self, match)
        # second capture group: the member ID from the URL path
        self.member_id = match.group(2)

    def items(self):
        """Queue the share URL of each post returned for this member"""
        queue_data = {"_extractor": WeversePostExtractor}
        for entry in self.api.profile(self.member_id):
            yield Message.Queue, entry["shareUrl"], queue_data


class WeverseArtistTabExtractor(WeverseExtractor):
    """Extractor for all artists in a weverse community"""
    subcategory = "artist-tab"
    pattern = COMMUNITY_PATTERN + "/artist$"
    example = "https://weverse.io/abcdef/artist"

    def __init__(self, match):
        WeverseExtractor.__init__(self, match)
        # first capture group: the community keyword from the URL path
        self.community_keyword = match.group(1)

    def items(self):
        """Queue the share URL of each post in the community's artist tab"""
        queue_data = {"_extractor": WeversePostExtractor}
        for entry in self.api.artist_tab(self.community_keyword):
            yield Message.Queue, entry["shareUrl"], queue_data


class WeverseMomentExtractor(WeverseExtractor):
    """Extractor for moments from a weverse community artist"""
    subcategory = "moment"
    directory_fmt = ("{category}", "{community[communityName]}",
                     "{author_name}", "{postId}")
    archive_fmt = "{postId}"
    pattern = (COMMUNITY_PATTERN +
               "/moment" + MEMBER_ID_PATTERN +
               "/post" + POST_ID_PATTERN)
    example = ("https://weverse.io/abcdef"
               "/moment/a0b1c2d3e4f5a6b7c8d9e0f1a2b3c4d5"
               "/post/1-123456789")

    def __init__(self, match):
        WeverseExtractor.__init__(self, match)
        self.post_id = match.group(3)

    def items(self):
        """Yield a directory message and one URL message for the moment

        The moment is read from the post's 'extension' field, under
        either 'moment' or 'momentW1'. Moments without a photo or video
        are skipped. If both are present, the video is used.
        """
        data = self.api.post(self.post_id)

        extension = data.get("extension") or {}
        if "moment" in extension:
            moment = extension["moment"]
        elif "momentW1" in extension:
            moment = extension["momentW1"]
        else:
            moment = {}

        # skip moments with no media
        if not self.has_media(moment):
            self.log.debug("Skipping %s (no media)", self.url)
            return

        url = ""
        file_id = ""
        if "video" in moment:
            file_id = moment["video"]["videoId"]
            # video URLs need an extra API request
            url = self.api.video(file_id)["url"]
        elif "photo" in moment:
            url = moment["photo"]["url"]
            file_id = moment["photo"]["photoId"]

        # has_media() also accepts other fields;
        # never emit a URL message with an empty URL
        if not url:
            self.log.debug("Skipping %s (no photo or video)", self.url)
            return

        self.metadata(data)

        # drop bulky fields from the metadata; either may be absent
        data.pop("extension", None)
        data.pop("authorMomentPosts", None)

        yield Message.Directory, data

        data["filename"] = self.category + "_" + file_id
        data["extension"] = text.ext_from_url(url)
        yield Message.Url, url, data


class WeverseMomentsExtractor(WeverseExtractor):
    """Extractor for all moments from a weverse community artist"""
    subcategory = "moments"
    pattern = COMMUNITY_PATTERN + "/moment" + MEMBER_ID_PATTERN + "$"
    example = ("https://weverse.io/abcdef"
               "/moment/a0b1c2d3e4f5a6b7c8d9e0f1a2b3c4d5")

    def __init__(self, match):
        WeverseExtractor.__init__(self, match)
        # second capture group: the member ID from the URL path
        self.member_id = match.group(2)

    def items(self):
        """Queue the share URL of each moment returned for this member"""
        queue_data = {"_extractor": WeverseMomentExtractor}
        for entry in self.api.moments(self.member_id):
            yield Message.Queue, entry["shareUrl"], queue_data


class WeverseAPI():
    """Interface for the Weverse API

    Every request is sent with a bearer token and signed with an
    HMAC-SHA1 digest ('wmd') over the endpoint, its query parameters,
    and a millisecond timestamp ('wmsgpad').
    """
    BASE_API_URL = "https://global.apis.naver.com"

    def __init__(self, extractor, access_token):
        self.extractor = extractor
        self.headers = {"Authorization": "Bearer " + access_token}

    def _endpoint_with_params(self, endpoint, params):
        """Return 'endpoint' with 'params' appended as a query string"""
        delimiter = "&" if "?" in endpoint else "?"
        return endpoint + delimiter + urllib.parse.urlencode(query=params)

    def _message_digest(self, endpoint, params, timestamp):
        """Return the base64-encoded HMAC-SHA1 signature for a request

        The signed message is the first 255 characters of the endpoint
        including its query string, followed by the timestamp.
        """
        key = b"1b9cb6378d959b45714bec49971ade22e6e24e42"
        url = self._endpoint_with_params(endpoint, params)
        message = "{}{}".format(url[:255], timestamp).encode()
        digest = hmac.new(key, message, hashlib.sha1).digest()
        return binascii.b2a_base64(digest).rstrip().decode()

    def community_id(self, community_keyword):
        """Return the community ID for a community URL keyword"""
        endpoint = "/community/v1.0/communityIdUrlPathByUrlPathArtistCode"
        params = {"keyword": community_keyword}
        return self._call(endpoint, params)["communityId"]

    def post(self, post_id):
        """Return the data of a single post"""
        endpoint = "/post/v1.0/post-{}".format(post_id)
        params = {"fieldSet": "postV1"}
        return self._call(endpoint, params)

    def video(self, video_id):
        """Return the download info entry with the highest resolution

        Assumes 'downloadInfo' is a non-empty list whose entries have
        a 'resolution' value such as '1080P'.
        """
        endpoint = "/cvideo/v1.0/cvideo-{}/downloadInfo".format(video_id)
        videos = self._call(endpoint)["downloadInfo"]
        return max(videos, key=lambda video:
                   text.parse_int(video["resolution"].rstrip("P")))

    def profile(self, member_id):
        """Yield all posts of a member, latest first"""
        endpoint = "/post/v1.0/member-{}/posts".format(member_id)
        params = {
            "fieldSet": "postsV1",
            "filterType": "DEFAULT",
            "limit": 20,
            "sortType": "LATEST",
        }
        yield from self._pagination(endpoint, params)

    def artist_tab(self, community_keyword):
        """Yield all posts from the artist tab of a community"""
        community_id = self.community_id(community_keyword)
        endpoint = "/post/v1.0/community-{}/artistTabPosts".format(
            community_id)
        params = {
            "fieldSet": "postsV1",
            "limit": 20,
            "pagingType": "CURSOR",
        }
        yield from self._pagination(endpoint, params)

    def moments(self, member_id):
        """Yield all moments of a member"""
        endpoint = "/post/v1.0/member-{}/posts".format(member_id)
        params = {
            "fieldSet": "postsV1",
            "filterType": "MOMENT",
            "limit": 1,
        }
        yield from self._pagination(endpoint, params)

    def _call(self, endpoint, params=None):
        """Send a signed GET request and return the decoded JSON response

        'params' is never modified; the request uses a sorted copy.

        Raises AuthenticationError on HTTP 401, AuthorizationError on 403,
        and NotFoundError on 404. Any other HttpError is logged and
        re-raised, so callers never receive None in place of a response.
        """
        query = dict(params) if params else {}
        query.update({
            "appId": "be4d79eb8fc7bd008ee82c8ec4ff6fd4",
            "language": "en",
            "platform": "WEB",
            "wpf": "pc",
        })
        # the signature covers the parameters in sorted order,
        # before the signature fields themselves are added
        query = OrderedDict(sorted(query.items()))
        timestamp = int(time.time() * 1000)
        message_digest = self._message_digest(endpoint, query, timestamp)
        query.update({
            "wmsgpad": timestamp,
            "wmd": message_digest,
        })

        try:
            return self.extractor.request(
                self.BASE_API_URL + "/weverse/wevweb" + endpoint,
                params=query, headers=self.headers,
            ).json()
        except exception.HttpError as exc:
            if exc.status == 401:
                raise exception.AuthenticationError()
            if exc.status == 403:
                raise exception.AuthorizationError(
                    "Post requires membership")
            if exc.status == 404:
                raise exception.NotFoundError(self.extractor.subcategory)
            self.extractor.log.debug(exc)
            raise

    def _pagination(self, endpoint, params=None):
        """Yield the 'data' entries of every page of a cursor-paged endpoint

        Stops when a response provides no 'after' cursor for a next page.
        'params' is never modified.
        """
        query = dict(params) if params else {}
        while True:
            res = self._call(endpoint, query)
            yield from res["data"]

            paging = res.get("paging") or {}
            cursor = (paging.get("nextParams") or {}).get("after")
            if not cursor:
                # without a cursor the same page would be requested again
                return
            query["after"] = cursor

0 comments on commit cbb6bc7

Please sign in to comment.