[pornhub] add 'gif' support (#4463)

mikf · Aug 29, 2023 · a783c4f
1 parent ba84298
commit a783c4f
Show file tree

Hide file tree

Showing 3 changed files with 133 additions and 30 deletions.
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
@@ -694,7 +694,7 @@ Consider all sites to be NSFW unless otherwise known.
 <tr>
     <td>Pornhub</td>
     <td>https://www.pornhub.com/</td>
-    <td>Galleries, User Profiles</td>
+    <td>Galleries, Gifs, Photos, User Profiles</td>
     <td></td>
 </tr>
 <tr>

diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py
@@ -19,6 +19,35 @@ class PornhubExtractor(Extractor):
     category = "pornhub"
     root = "https://www.pornhub.com"
 
+    def _init(self):
+        self.cookies.set(
+            "accessAgeDisclaimerPH", "1", domain=".pornhub.com")
+
+    def _pagination(self, user, path):
+        if "/" not in path:
+            path += "/public"
+
+        url = "{}/{}/{}/ajax".format(self.root, user, path)
+        params = {"page": 1}
+        headers = {
+            "Referer": url[:-5],
+            "X-Requested-With": "XMLHttpRequest",
+        }
+
+        while True:
+            response = self.request(
+                url, method="POST", headers=headers, params=params,
+                allow_redirects=False)
+
+            if 300 <= response.status_code < 400:
+                url = "{}{}/{}/ajax".format(
+                    self.root, response.headers["location"], path)
+                continue
+
+            yield response.text
+
+            params["page"] += 1
+
 
 class PornhubGalleryExtractor(PornhubExtractor):
     """Extractor for image galleries on pornhub.com"""
@@ -58,9 +87,6 @@ def __init__(self, match):
         self._first = None
 
     def items(self):
-        self.cookies.set(
-            "accessAgeDisclaimerPH", "1", domain=".pornhub.com")
-
         data = self.metadata()
         yield Message.Directory, data
         for num, image in enumerate(self.images(), 1):
@@ -116,17 +142,83 @@ def images(self):
                 return
 
 
+class PornhubGifExtractor(PornhubExtractor):
+    """Extractor for pornhub.com gifs"""
+    subcategory = "gif"
+    directory_fmt = ("{category}", "{user}", "gifs")
+    filename_fmt = "{id} {title}.{extension}"
+    archive_fmt = "{id}"
+    pattern = BASE_PATTERN + r"/gif/(\d+)"
+    test = (
+        ("https://www.pornhub.com/gif/33643461", {
+            "pattern": r"https://\w+\.phncdn\.com/pics/gifs"
+                       r"/033/643/461/33643461a\.webm",
+            "keyword": {
+                "date": "dt:2020-10-31 00:00:00",
+                "extension": "webm",
+                "filename": "33643461a",
+                "id": "33643461",
+                "tags": ["big boobs", "lana rhoades"],
+                "title": "Big boobs",
+                "url": str,
+                "user": "Lana Rhoades",
+            },
+        }),
+    )
+
+    def __init__(self, match):
+        PornhubExtractor.__init__(self, match)
+        self.gallery_id = match.group(1)
+
+    def items(self):
+        url = "{}/gif/{}".format(self.root, self.gallery_id)
+        extr = text.extract_from(self.request(url).text)
+
+        gif = {
+            "id"   : self.gallery_id,
+            "tags" : extr("data-context-tag='", "'").split(","),
+            "title": extr('"name": "', '"'),
+            "url"  : extr('"contentUrl": "', '"'),
+            "date" : text.parse_datetime(
+                extr('"uploadDate": "', '"'), "%Y-%m-%d"),
+            "user" : extr('data-mxptext="', '"'),
+        }
+
+        yield Message.Directory, gif
+        yield Message.Url, gif["url"], text.nameext_from_url(gif["url"], gif)
+
+
 class PornhubUserExtractor(PornhubExtractor):
-    """Extractor for all galleries of a pornhub user"""
+    """Extractor for a pornhub user"""
     subcategory = "user"
-    pattern = (BASE_PATTERN + r"/(users|model|pornstar)/([^/?#]+)"
-               "(?:/photos(?:/(public|private|favorites))?)?/?$")
+    pattern = BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)/?$"
+    test = ("https://www.pornhub.com/pornstar/danika-mori",)
+
+    def __init__(self, match):
+        PornhubExtractor.__init__(self, match)
+        self.user = match.group(1)
+
+    def initialize(self):
+        pass
+
+    def items(self):
+        base = "{}/{}/".format(self.root, self.user)
+        return self._dispatch_extractors((
+            (PornhubPhotosExtractor, base + "photos"),
+            (PornhubGifsExtractor  , base + "gifs"),
+        ), ("photos",))
+
+
+class PornhubPhotosExtractor(PornhubExtractor):
+    """Extractor for all galleries of a pornhub user"""
+    subcategory = "photos"
+    pattern = (BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)"
+               "/(photos(?:/[^/?#]+)?)")
     test = (
         ("https://www.pornhub.com/pornstar/danika-mori/photos", {
             "pattern": PornhubGalleryExtractor.pattern,
             "count": ">= 6",
         }),
-        ("https://www.pornhub.com/users/flyings0l0/"),
         ("https://www.pornhub.com/users/flyings0l0/photos/public"),
         ("https://www.pornhub.com/users/flyings0l0/photos/private"),
         ("https://www.pornhub.com/users/flyings0l0/photos/favorites"),
@@ -135,33 +227,41 @@ class PornhubUserExtractor(PornhubExtractor):
 
     def __init__(self, match):
         PornhubExtractor.__init__(self, match)
-        self.type, self.user, self.cat = match.groups()
+        self.user, self.path = match.groups()
 
     def items(self):
-        url = "{}/{}/{}/photos/{}/ajax".format(
-            self.root, self.type, self.user, self.cat or "public")
-        params = {"page": 1}
-        headers = {
-            "Referer": url[:-5],
-            "X-Requested-With": "XMLHttpRequest",
-        }
-
         data = {"_extractor": PornhubGalleryExtractor}
-        while True:
-            response = self.request(
-                url, method="POST", headers=headers, params=params,
-                allow_redirects=False)
-
-            if 300 <= response.status_code < 400:
-                url = "{}{}/photos/{}/ajax".format(
-                    self.root, response.headers["location"],
-                    self.cat or "public")
-                continue
-
+        for page in self._pagination(self.user, self.path):
             gid = None
-            for gid in text.extract_iter(response.text, 'id="albumphoto', '"'):
+            for gid in text.extract_iter(page, 'id="albumphoto', '"'):
                 yield Message.Queue, self.root + "/album/" + gid, data
             if gid is None:
                 return
 
-            params["page"] += 1
+
+class PornhubGifsExtractor(PornhubExtractor):
+    """Extractor for a pornhub user's gifs"""
+    subcategory = "gifs"
+    pattern = (BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)"
+               "/(gifs(?:/[^/?#]+)?)")
+    test = (
+        ("https://www.pornhub.com/pornstar/danika-mori/gifs", {
+            "pattern": PornhubGifExtractor.pattern,
+            "count": ">= 42",
+        }),
+        ("https://www.pornhub.com/users/flyings0l0/gifs"),
+        ("https://www.pornhub.com/model/bossgirl/gifs/video"),
+    )
+
+    def __init__(self, match):
+        PornhubExtractor.__init__(self, match)
+        self.user, self.path = match.groups()
+
+    def items(self):
+        data = {"_extractor": PornhubGifExtractor}
+        for page in self._pagination(self.user, self.path):
+            gid = None
+            for gid in text.extract_iter(page, 'id="gif', '"'):
+                yield Message.Queue, self.root + "/gif/" + gid, data
+            if gid is None:
+                return
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
@@ -224,6 +224,9 @@
         "sketch": "Sketch",
         "work": "individual Images",
     },
+    "pornhub": {
+        "gifs": "",
+    },
     "reddit": {
         "home": "Home Feed",
     },