]> jfr.im git - yt-dlp.git/commitdiff
Update to ytdl-commit-195f22f6
author Simon Sawicki <redacted>
Mon, 2 Jan 2023 13:45:36 +0000 (14:45 +0100)
committer GitHub <redacted>
Mon, 2 Jan 2023 13:45:36 +0000 (19:15 +0530)
[generic] Improve KVS (etc) extraction
https://github.com/ytdl-org/youtube-dl/commit/195f22f679330549882a8234e7234942893a4902

Closes #3716
Authored by: Grub4k, pukkandan

yt_dlp/extractor/_extractors.py
yt_dlp/extractor/common.py
yt_dlp/extractor/generic.py
yt_dlp/extractor/peekvids.py
yt_dlp/extractor/thisvid.py [new file with mode: 0644]

index 352de83cac0a3a40a06aface65df28ae0aac2605..83e732189c4b3510c95cb49f0da8d411d02cd828 100644 (file)
 from .thisamericanlife import ThisAmericanLifeIE
 from .thisav import ThisAVIE
 from .thisoldhouse import ThisOldHouseIE
+from .thisvid import (
+    ThisVidIE,
+    ThisVidMemberIE,
+    ThisVidPlaylistIE,
+)
 from .threespeak import (
     ThreeSpeakIE,
     ThreeSpeakUserIE,
index f48b97a6b6d75ecdee3672398d2ce86bd0cce3f6..21d5c39fdb59a08b66a6f4e8797203c7ca7e69ac 100644 (file)
@@ -1396,10 +1396,16 @@ def _rta_search(html):
         # And then there are the jokers who advertise that they use RTA, but actually don't.
         AGE_LIMIT_MARKERS = [
             r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
+            r'>[^<]*you acknowledge you are at least (\d+) years old',
+            r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
         ]
-        if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
-            return 18
-        return 0
+
+        age_limit = 0
+        for marker in AGE_LIMIT_MARKERS:
+            mobj = re.search(marker, html)
+            if mobj:
+                age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
+        return age_limit
 
     def _media_rating_search(self, html):
         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
@@ -3216,7 +3222,7 @@ def manifest_url(manifest):
 
     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
         mobj = re.search(
-            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
+            r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
             webpage)
         if mobj:
             try:
@@ -3237,19 +3243,20 @@ def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
 
     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
-        # JWPlayer backward compatibility: flattened playlists
-        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
-        if 'playlist' not in jwplayer_data:
-            jwplayer_data = {'playlist': [jwplayer_data]}
-
         entries = []
+        if not isinstance(jwplayer_data, dict):
+            return entries
 
-        # JWPlayer backward compatibility: single playlist item
+        playlist_items = jwplayer_data.get('playlist')
+        # JWPlayer backward compatibility: single playlist item/flattened playlists
         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
-        if not isinstance(jwplayer_data['playlist'], list):
-            jwplayer_data['playlist'] = [jwplayer_data['playlist']]
+        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
+        if not isinstance(playlist_items, list):
+            playlist_items = (playlist_items or jwplayer_data, )
 
-        for video_data in jwplayer_data['playlist']:
+        for video_data in playlist_items:
+            if not isinstance(video_data, dict):
+                continue
             # JWPlayer backward compatibility: flattened sources
             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
             if 'sources' not in video_data:
@@ -3287,6 +3294,13 @@ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                 'timestamp': int_or_none(video_data.get('pubdate')),
                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                 'subtitles': subtitles,
+                'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
+                'genre': clean_html(video_data.get('genre')),
+                'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
+                'season_number': int_or_none(video_data.get('season')),
+                'episode_number': int_or_none(video_data.get('episode')),
+                'release_year': int_or_none(video_data.get('releasedate')),
+                'age_limit': int_or_none(video_data.get('age_restriction')),
             }
             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
@@ -3304,7 +3318,7 @@ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
 
     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
-        urls = []
+        urls = set()
         formats = []
         for source in jwplayer_sources_data:
             if not isinstance(source, dict):
@@ -3313,14 +3327,14 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                 base_url, self._proto_relative_url(source.get('file')))
             if not source_url or source_url in urls:
                 continue
-            urls.append(source_url)
+            urls.add(source_url)
             source_type = source.get('type') or ''
             ext = mimetype2ext(source_type) or determine_ext(source_url)
-            if source_type == 'hls' or ext == 'm3u8':
+            if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
                 formats.extend(self._extract_m3u8_formats(
                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                     m3u8_id=m3u8_id, fatal=False))
-            elif source_type == 'dash' or ext == 'mpd':
+            elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
                 formats.extend(self._extract_mpd_formats(
                     source_url, video_id, mpd_id=mpd_id, fatal=False))
             elif ext == 'smil':
@@ -3335,13 +3349,12 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                     'ext': ext,
                 })
             else:
+                format_id = str_or_none(source.get('label'))
                 height = int_or_none(source.get('height'))
-                if height is None:
+                if height is None and format_id:
                     # Often no height is provided but there is a label in
                     # format like "1080p", "720p SD", or 1080.
-                    height = int_or_none(self._search_regex(
-                        r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
-                        'height', default=None))
+                    height = parse_resolution(format_id).get('height')
                 a_format = {
                     'url': source_url,
                     'width': int_or_none(source.get('width')),
@@ -3349,6 +3362,7 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                     'tbr': int_or_none(source.get('bitrate'), scale=1000),
                     'filesize': int_or_none(source.get('filesize')),
                     'ext': ext,
+                    'format_id': format_id
                 }
                 if source_url.startswith('rtmp'):
                     a_format['ext'] = 'flv'
index ffc2790230e63da75f2a18a397f7b74a0fbcfdc7..14d492f075580ba6f93710029144cd7fac24cdbb 100644 (file)
@@ -32,6 +32,7 @@
     unified_timestamp,
     unsmuggle_url,
     url_or_none,
+    urljoin,
     variadic,
     xpath_attr,
     xpath_text,
@@ -1867,11 +1868,13 @@ class GenericIE(InfoExtractor):
                 'display_id': 'kelis-4th-of-july',
                 'ext': 'mp4',
                 'title': 'Kelis - 4th Of July',
-                'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
+                'description': 'Kelis - 4th Of July',
+                'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
             },
             'params': {
                 'skip_download': True,
             },
+            'expected_warnings': ['Untested major version'],
         }, {
             # KVS Player
             'url': 'https://www.kvs-demo.com/embed/105/',
@@ -1880,35 +1883,12 @@ class GenericIE(InfoExtractor):
                 'display_id': 'kelis-4th-of-july',
                 'ext': 'mp4',
                 'title': 'Kelis - 4th Of July / Embed Player',
-                'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
+                'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
             },
             'params': {
                 'skip_download': True,
             },
         }, {
-            # KVS Player
-            'url': 'https://thisvid.com/videos/french-boy-pantsed/',
-            'md5': '3397979512c682f6b85b3b04989df224',
-            'info_dict': {
-                'id': '2400174',
-                'display_id': 'french-boy-pantsed',
-                'ext': 'mp4',
-                'title': 'French Boy Pantsed - ThisVid.com',
-                'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
-            }
-        }, {
-            # KVS Player
-            'url': 'https://thisvid.com/embed/2400174/',
-            'md5': '3397979512c682f6b85b3b04989df224',
-            'info_dict': {
-                'id': '2400174',
-                'display_id': 'french-boy-pantsed',
-                'ext': 'mp4',
-                'title': 'French Boy Pantsed - ThisVid.com',
-                'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
-            }
-        }, {
-            # KVS Player
             'url': 'https://youix.com/video/leningrad-zoj/',
             'md5': '94f96ba95706dc3880812b27b7d8a2b8',
             'info_dict': {
@@ -1916,8 +1896,8 @@ class GenericIE(InfoExtractor):
                 'display_id': 'leningrad-zoj',
                 'ext': 'mp4',
                 'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com',
-                'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg',
-            }
+                'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
+            },
         }, {
             # KVS Player
             'url': 'https://youix.com/embed/18485',
@@ -1927,19 +1907,20 @@ class GenericIE(InfoExtractor):
                 'display_id': 'leningrad-zoj',
                 'ext': 'mp4',
                 'title': 'Ленинград - ЗОЖ',
-                'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg',
-            }
+                'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
+            },
         }, {
             # KVS Player
             'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/',
             'md5': '94166bdb26b4cb1fb9214319a629fc51',
             'info_dict': {
                 'id': '21217',
-                'display_id': '40-nochey-40-nights-2016',
+                'display_id': '40-nochey-2016',
                 'ext': 'mp4',
                 'title': '40 ночей (2016) - BogMedia.org',
+                'description': 'md5:4e6d7d622636eb7948275432eb256dc3',
                 'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg',
-            }
+            },
         },
         {
             # KVS Player (for sites that serve kt_player.js via non-https urls)
@@ -1949,9 +1930,9 @@ class GenericIE(InfoExtractor):
                 'id': '389508',
                 'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source',
                 'ext': 'mp4',
-                'title': 'Syren De Mer  onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
-                'thumbnail': 'http://www.camhub.world/contents/videos_screenshots/389000/389508/preview.mp4.jpg',
-            }
+                'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
+                'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg',
+            },
         },
         {
             # Reddit-hosted video that will redirect and be processed by RedditIE
@@ -2169,7 +2150,20 @@ class GenericIE(InfoExtractor):
                 'direct': True,
                 'age_limit': 0,
             }
-        }
+        },
+        {
+            'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/',
+            'md5': 'e2f0a4c329f7986280b7328e24036d60',
+            'info_dict': {
+                'id': '284002',
+                'display_id': 'just-out-of-the-shower-joi',
+                'ext': 'mp4',
+                'title': 'Just Out Of The Shower JOI - Shooshtime',
+                'thumbnail': 'https://i.shoosh.co/contents/videos_screenshots/284000/284002/preview.mp4.jpg',
+                'height': 720,
+                'age_limit': 18,
+            },
+        },
     ]
 
     def report_following_redirect(self, new_url):
@@ -2235,43 +2229,87 @@ def itunes(key):
             'entries': entries,
         }
 
-    def _kvs_getrealurl(self, video_url, license_code):
+    @classmethod
+    def _kvs_get_real_url(cls, video_url, license_code):
         if not video_url.startswith('function/0/'):
             return video_url  # not obfuscated
 
-        url_path, _, url_query = video_url.partition('?')
-        urlparts = url_path.split('/')[2:]
-        license = self._kvs_getlicensetoken(license_code)
-        newmagic = urlparts[5][:32]
+        parsed = urllib.parse.urlparse(video_url[len('function/0/'):])
+        license = cls._kvs_get_license_token(license_code)
+        urlparts = parsed.path.split('/')
 
-        for o in range(len(newmagic) - 1, -1, -1):
-            new = ''
-            l = (o + sum(int(n) for n in license[o:])) % 32
+        HASH_LENGTH = 32
+        hash = urlparts[3][:HASH_LENGTH]
+        indices = list(range(HASH_LENGTH))
 
-            for i in range(0, len(newmagic)):
-                if i == o:
-                    new += newmagic[l]
-                elif i == l:
-                    new += newmagic[o]
-                else:
-                    new += newmagic[i]
-            newmagic = new
+        # Swap indices of hash according to the destination calculated from the license token
+        accum = 0
+        for src in reversed(range(HASH_LENGTH)):
+            accum += license[src]
+            dest = (src + accum) % HASH_LENGTH
+            indices[src], indices[dest] = indices[dest], indices[src]
+
+        urlparts[3] = ''.join(hash[index] for index in indices) + urlparts[3][HASH_LENGTH:]
+        return urllib.parse.urlunparse(parsed._replace(path='/'.join(urlparts)))
 
-        urlparts[5] = newmagic + urlparts[5][32:]
-        return '/'.join(urlparts) + '?' + url_query
+    @staticmethod
+    def _kvs_get_license_token(license):
+        license = license.replace('$', '')
+        license_values = [int(char) for char in license]
 
-    def _kvs_getlicensetoken(self, license):
-        modlicense = license.replace('$', '').replace('0', '1')
-        center = int(len(modlicense) / 2)
+        modlicense = license.replace('0', '1')
+        center = len(modlicense) // 2
         fronthalf = int(modlicense[:center + 1])
         backhalf = int(modlicense[center:])
+        modlicense = str(4 * abs(fronthalf - backhalf))[:center + 1]
+
+        return [
+            (license_values[index + offset] + current) % 10
+            for index, current in enumerate(map(int, modlicense))
+            for offset in range(4)
+        ]
+
+    def _extract_kvs(self, url, webpage, video_id):
+        flashvars = self._search_json(
+            r'(?s:<script\b[^>]*>.*?var\s+flashvars\s*=)',
+            webpage, 'flashvars', video_id, transform_source=js_to_json)
+
+        # extract the part after the last / as the display_id from the
+        # canonical URL.
+        display_id = self._search_regex(
+            r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
+            r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
+            webpage, 'display_id', fatal=False)
+        title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')
+
+        thumbnail = flashvars['preview_url']
+        if thumbnail.startswith('//'):
+            protocol, _, _ = url.partition('/')
+            thumbnail = protocol + thumbnail
+
+        url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys()))
+        formats = []
+        for key in url_keys:
+            if '/get_file/' not in flashvars[key]:
+                continue
+            format_id = flashvars.get(f'{key}_text', key)
+            formats.append({
+                'url': urljoin(url, self._kvs_get_real_url(flashvars[key], flashvars['license_code'])),
+                'format_id': format_id,
+                'ext': 'mp4',
+                **(parse_resolution(format_id) or parse_resolution(flashvars[key])),
+                'http_headers': {'Referer': url},
+            })
+            if not formats[-1].get('height'):
+                formats[-1]['quality'] = 1
 
-        modlicense = str(4 * abs(fronthalf - backhalf))
-        retval = ''
-        for o in range(0, center + 1):
-            for i in range(1, 5):
-                retval += str((int(license[o + i]) + int(modlicense[o])) % 10)
-        return retval
+        return {
+            'id': flashvars['video_id'],
+            'display_id': display_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
 
     def _real_extract(self, url):
         if url.startswith('//'):
@@ -2580,6 +2618,17 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
                 self.report_detected('video.js embed')
                 return [{'formats': formats, 'subtitles': subtitles}]
 
+        # Look for generic KVS player (before json-ld bc of some urls that break otherwise)
+        found = self._search_regex((
+            r'<script\b[^>]+?\bsrc\s*=\s*(["\'])https?://(?:\S+?/)+kt_player\.js\?v=(?P<ver>\d+(?:\.\d+)+)\1[^>]*>',
+            r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:\S+?/)+kt_player\.swf\?v=(?P<ver>\d+(?:\.\d+)+)\2\s*,',
+        ), webpage, 'KVS player', group='ver', default=False)
+        if found:
+            self.report_detected('KWS Player')
+            if found.split('.')[0] not in ('4', '5', '6'):
+                self.report_warning(f'Untested major version ({found}) in player engine - download may fail.')
+            return [self._extract_kvs(url, webpage, video_id)]
+
         # Looking for http://schema.org/VideoObject
         json_ld = self._search_json_ld(webpage, video_id, default={})
         if json_ld.get('url') not in (url, None):
@@ -2622,52 +2671,6 @@ def filter_video(urls):
                 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
             if found:
                 self.report_detected('JW Player embed')
-        if not found:
-            # Look for generic KVS player
-            found = re.search(r'<script [^>]*?src="https?://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage)
-            if found:
-                self.report_detected('KWS Player')
-                if found.group('maj_ver') not in ['4', '5']:
-                    self.report_warning('Untested major version (%s) in player engine--Download may fail.' % found.group('ver'))
-                flashvars = re.search(r'(?ms)<script.*?>.*?var\s+flashvars\s*=\s*(\{.*?\});.*?</script>', webpage)
-                flashvars = self._parse_json(flashvars.group(1), video_id, transform_source=js_to_json)
-
-                # extract the part after the last / as the display_id from the
-                # canonical URL.
-                display_id = self._search_regex(
-                    r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
-                    r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
-                    webpage, 'display_id', fatal=False
-                )
-                title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')
-
-                thumbnail = flashvars['preview_url']
-                if thumbnail.startswith('//'):
-                    protocol, _, _ = url.partition('/')
-                    thumbnail = protocol + thumbnail
-
-                url_keys = list(filter(re.compile(r'video_url|video_alt_url\d*').fullmatch, flashvars.keys()))
-                formats = []
-                for key in url_keys:
-                    if '/get_file/' not in flashvars[key]:
-                        continue
-                    format_id = flashvars.get(f'{key}_text', key)
-                    formats.append({
-                        'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']),
-                        'format_id': format_id,
-                        'ext': 'mp4',
-                        **(parse_resolution(format_id) or parse_resolution(flashvars[key]))
-                    })
-                    if not formats[-1].get('height'):
-                        formats[-1]['quality'] = 1
-
-                return [{
-                    'id': flashvars['video_id'],
-                    'display_id': display_id,
-                    'title': title,
-                    'thumbnail': thumbnail,
-                    'formats': formats,
-                }]
         if not found:
             # Broaden the search a little bit
             found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
index 2d9b9a742513e4d215d96838e5e1524391e8b981..d1fc058b92bc1f8f1e26ca90f384329701b05852 100644 (file)
+import re
+
 from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    get_element_by_class,
+    int_or_none,
+    merge_dicts,
+    url_or_none,
+)
+
+
+class PeekVidsBaseIE(InfoExtractor):
+    def _real_extract(self, url):
+        domain, video_id = self._match_valid_url(url).group('domain', 'id')
+        webpage = self._download_webpage(url, video_id, expected_status=429)
+        if '>Rate Limit Exceeded' in webpage:
+            raise ExtractorError(
+                f'You are suspected as a bot. Wait, or pass the captcha on the site and provide cookies. {self._login_hint()}',
+                video_id=video_id, expected=True)
+
+        title = self._html_search_regex(r'(?s)<h1\b[^>]*>(.+?)</h1>', webpage, 'title')
+
+        display_id = video_id
+        video_id = self._search_regex(r'(?s)<video\b[^>]+\bdata-id\s*=\s*["\']?([\w-]+)', webpage, 'short video ID')
+        srcs = self._download_json(
+            f'https://www.{domain}/v-alt/{video_id}', video_id,
+            note='Downloading list of source files')
+
+        formats = []
+        for k, v in srcs.items():
+            f_url = url_or_none(v)
+            if not f_url:
+                continue
+
+            height = self._search_regex(r'^data-src(\d{3,})$', k, 'height', default=None)
+            if not height:
+                continue
+
+            formats.append({
+                'url': f_url,
+                'format_id': height,
+                'height': int_or_none(height),
+            })
+
+        if not formats:
+            formats = [{'url': url} for url in srcs.values()]
 
+        info = self._search_json_ld(webpage, video_id, expected_type='VideoObject', default={})
+        info.pop('url', None)
 
-class PeekVidsIE(InfoExtractor):
+        # may not have found the thumbnail if it was in a list in the ld+json
+        info.setdefault('thumbnail', self._og_search_thumbnail(webpage))
+        detail = (get_element_by_class('detail-video-block', webpage)
+                  or get_element_by_class('detail-block', webpage) or '')
+        info['description'] = self._html_search_regex(
+            rf'(?s)(.+?)(?:{re.escape(info.get("description", ""))}\s*<|<ul\b)',
+            detail, 'description', default=None) or None
+        info['title'] = re.sub(r'\s*[,-][^,-]+$', '', info.get('title') or title) or self._generic_title(url)
+
+        def cat_tags(name, html):
+            l = self._html_search_regex(
+                rf'(?s)<span\b[^>]*>\s*{re.escape(name)}\s*:\s*</span>(.+?)</li>',
+                html, name, default='')
+            return list(filter(None, re.split(r'\s+', l)))
+
+        return merge_dicts({
+            'id': video_id,
+            'display_id': display_id,
+            'age_limit': 18,
+            'formats': formats,
+            'categories': cat_tags('Categories', detail),
+            'tags': cat_tags('Tags', detail),
+            'uploader': self._html_search_regex(r'[Uu]ploaded\s+by\s(.+?)"', webpage, 'uploader', default=None),
+        }, info)
+
+
+class PeekVidsIE(PeekVidsBaseIE):
     _VALID_URL = r'''(?x)
-        https?://(?:www\.)?peekvids\.com/
+        https?://(?:www\.)?(?P<domain>peekvids\.com)/
         (?:(?:[^/?#]+/){2}|embed/?\?(?:[^#]*&)?v=)
         (?P<id>[^/?&#]*)
     '''
     _TESTS = [{
         'url': 'https://peekvids.com/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp/BSyLMbN0YCd',
-        'md5': 'a00940646c428e232407e3e62f0e8ef5',
+        'md5': '2ff6a357a9717dc9dc9894b51307e9a2',
         'info_dict': {
-            'id': 'BSyLMbN0YCd',
-            'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp, SEXYhub',
+            'id': '1262717',
+            'display_id': 'BSyLMbN0YCd',
+            'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp',
             'ext': 'mp4',
             'thumbnail': r're:^https?://.*\.jpg$',
-            'description': 'Watch  Dane Jones - Cute redhead with perfect tits with Mini Vamp (7 min), uploaded by SEXYhub.com',
+            'description': 'md5:0a61df3620de26c0af8963b1a730cd69',
             'timestamp': 1642579329,
             'upload_date': '20220119',
             'duration': 416,
             'view_count': int,
             'age_limit': 18,
+            'uploader': 'SEXYhub.com',
+            'categories': list,
+            'tags': list,
         },
     }]
-    _DOMAIN = 'www.peekvids.com'
 
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        short_video_id = self._html_search_regex(r'<video [^>]*data-id="(.+?)"', webpage, 'short video ID')
-        srcs = self._download_json(
-            f'https://{self._DOMAIN}/v-alt/{short_video_id}', video_id,
-            note='Downloading list of source files')
-        formats = [{
-            'url': url,
-            'ext': 'mp4',
-            'format_id': name[8:],
-        } for name, url in srcs.items() if len(name) > 8 and name.startswith('data-src')]
-        if not formats:
-            formats = [{'url': url} for url in srcs.values()]
 
-        info = self._search_json_ld(webpage, video_id, expected_type='VideoObject')
-        info.update({
-            'id': video_id,
-            'age_limit': 18,
-            'formats': formats,
-        })
-        return info
-
-
-class PlayVidsIE(PeekVidsIE):  # XXX: Do not subclass from concrete IE
-    _VALID_URL = r'https?://(?:www\.)?playvids\.com/(?:embed/|[^/]{2}/)?(?P<id>[^/?#]*)'
+class PlayVidsIE(PeekVidsBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?(?P<domain>playvids\.com)/(?:embed/|\w\w?/)?(?P<id>[^/?#]*)'
     _TESTS = [{
         'url': 'https://www.playvids.com/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp',
-        'md5': 'cd7dfd8a2e815a45402369c76e3c1825',
+        'md5': '2f12e50213dd65f142175da633c4564c',
         'info_dict': {
-            'id': 'U3pBrYhsjXM',
-            'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp, SEXYhub',
+            'id': '1978030',
+            'display_id': 'U3pBrYhsjXM',
+            'title': ' Dane Jones - Cute redhead with perfect tits with Mini Vamp',
             'ext': 'mp4',
             'thumbnail': r're:^https?://.*\.jpg$',
-            'description': 'Watch  Dane Jones - Cute redhead with perfect tits with Mini Vamp video in HD, uploaded by SEXYhub.com',
+            'description': 'md5:0a61df3620de26c0af8963b1a730cd69',
             'timestamp': 1640435839,
             'upload_date': '20211225',
             'duration': 416,
             'view_count': int,
             'age_limit': 18,
+            'uploader': 'SEXYhub.com',
+            'categories': list,
+            'tags': list,
         },
     }, {
         'url': 'https://www.playvids.com/es/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp',
@@ -73,5 +130,62 @@ class PlayVidsIE(PeekVidsIE):  # XXX: Do not subclass from concrete IE
     }, {
         'url': 'https://www.playvids.com/embed/U3pBrYhsjXM',
         'only_matching': True,
+    }, {
+        'url': 'https://www.playvids.com/bKmGLe3IwjZ/sv/brazzers-800-phone-sex-madison-ivy-always-on-the-line',
+        'md5': 'e783986e596cafbf46411a174ab42ba6',
+        'info_dict': {
+            'id': '762385',
+            'display_id': 'bKmGLe3IwjZ',
+            'ext': 'mp4',
+            'title': 'Brazzers - 1 800 Phone Sex: Madison Ivy Always On The Line 6',
+            'description': 'md5:bdcd2db2b8ad85831a491d7c8605dcef',
+            'timestamp': 1516958544,
+            'upload_date': '20180126',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 480,
+            'uploader': 'Brazzers',
+            'age_limit': 18,
+            'view_count': int,
+            'age_limit': 18,
+            'categories': list,
+            'tags': list,
+        },
+    }, {
+        'url': 'https://www.playvids.com/v/47iUho33toY',
+        'md5': 'b056b5049d34b648c1e86497cf4febce',
+        'info_dict': {
+            'id': '700621',
+            'display_id': '47iUho33toY',
+            'ext': 'mp4',
+            'title': 'KATEE OWEN STRIPTIASE IN SEXY RED LINGERIE',
+            'description': None,
+            'timestamp': 1507052209,
+            'upload_date': '20171003',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 332,
+            'uploader': 'Cacerenele',
+            'age_limit': 18,
+            'view_count': int,
+            'categories': list,
+            'tags': list,
+        },
+    }, {
+        'url': 'https://www.playvids.com/z3_7iwWCmqt/sexy-teen-filipina-striptease-beautiful-pinay-bargirl-strips-and-dances',
+        'md5': 'efa09be9f031314b7b7e3bc6510cd0df',
+        'info_dict': {
+            'id': '1523518',
+            'display_id': 'z3_7iwWCmqt',
+            'ext': 'mp4',
+            'title': 'SEXY TEEN FILIPINA STRIPTEASE - Beautiful Pinay Bargirl Strips and Dances',
+            'description': None,
+            'timestamp': 1607470323,
+            'upload_date': '20201208',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 593,
+            'uploader': 'yorours',
+            'age_limit': 18,
+            'view_count': int,
+            'categories': list,
+            'tags': list,
+        },
     }]
-    _DOMAIN = 'www.playvids.com'
diff --git a/yt_dlp/extractor/thisvid.py b/yt_dlp/extractor/thisvid.py
new file mode 100644 (file)
index 0000000..9d3368e
--- /dev/null
@@ -0,0 +1,226 @@
+import itertools
+import re
+import urllib.parse
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    get_element_by_class,
+    int_or_none,
+    url_or_none,
+    urljoin,
+)
+
+
+class ThisVidIE(InfoExtractor):
+    # Handles single-video pages (/videos/<slug>/) and embeds (/embed/<id>/).
+    # Only page-level metadata (title, uploader) is scraped here; the media
+    # itself is delegated to the Generic extractor via a transparent URL result.
+    _VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+)'
+    _TESTS = [{
+        'url': 'https://thisvid.com/videos/sitting-on-ball-tight-jeans/',
+        'md5': '839becb572995687e11a69dc4358a386',
+        'info_dict': {
+            'id': '3533241',
+            'ext': 'mp4',
+            'title': 'Sitting on ball tight jeans',
+            'description': 'md5:372353bb995883d1b65fddf507489acd',
+            'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
+            'uploader_id': '150629',
+            'uploader': 'jeanslevisjeans',
+            'display_id': 'sitting-on-ball-tight-jeans',
+            'age_limit': 18,
+        }
+    }, {
+        'url': 'https://thisvid.com/embed/3533241/',
+        'md5': '839becb572995687e11a69dc4358a386',
+        'info_dict': {
+            'id': '3533241',
+            'ext': 'mp4',
+            'title': 'Sitting on ball tight jeans',
+            'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
+            'uploader_id': '150629',
+            'uploader': 'jeanslevisjeans',
+            'display_id': 'sitting-on-ball-tight-jeans',
+            'age_limit': 18,
+        }
+    }]
+
+    def _extract_uploader(self, webpage):
+        """Return ``(uploader_id, uploader)`` scraped from the "Added by:" link.
+
+        Both values are None when the link is missing or cannot be split into
+        exactly an id part and a name part. The id is never empty on success;
+        the name may be missing, in which case it is returned as None.
+        """
+        # The capture spans from the member id inside the href up to the
+        # anchor text, e.g. `150629/" ...>name`; it is split apart below.
+        raw = self._html_search_regex(
+            r'''(?s)<span\b[^>]*>Added by:\s*</span><a\b[^>]+\bclass\s*=\s*["']author\b[^>]+\bhref\s*=\s*["']https://thisvid\.com/members/([0-9]+/.{3,}?)\s*</a>''',
+            webpage, 'uploader', default='')
+        parts = re.split(r'''/["'][^>]*>\s*''', raw)
+        if len(parts) != 2:
+            return None, None
+        uploader_id, uploader = parts
+        return uploader_id, uploader or None
+
+    def _real_extract(self, url):
+        """Scrape title/uploader from the page and defer media extraction to Generic."""
+        # Use the shared URL matcher, as ThisVidPlaylistIE does, rather than
+        # calling re.match on _VALID_URL directly.
+        main_id, type_ = self._match_valid_url(url).group('id', 'type')
+        webpage = self._download_webpage(url, main_id)
+
+        title = self._html_search_regex(
+            r'<title\b[^>]*?>(?:Video:\s+)?(.+?)(?:\s+-\s+ThisVid(?:\.com| tube))?</title>',
+            webpage, 'title')
+
+        if type_ == 'embed':
+            # look for more metadata
+            video_alt_url = url_or_none(self._search_regex(
+                rf'''video_alt_url\s*:\s+'({self._VALID_URL}/)',''',
+                webpage, 'video_alt_url', default=None))
+            if video_alt_url and video_alt_url != url:
+                # Best effort: keep the embed page if the main page cannot be fetched
+                webpage = self._download_webpage(
+                    video_alt_url, main_id,
+                    note='Redirecting embed to main page', fatal=False) or webpage
+
+        video_holder = get_element_by_class('video-holder', webpage) or ''
+        if '>This video is a private video' in video_holder:
+            # Report only the first line of the holder's text as the reason
+            self.raise_login_required(
+                (clean_html(video_holder) or 'Private video').partition('\n')[0])
+
+        uploader_id, uploader = self._extract_uploader(webpage)
+
+        return self.url_result(
+            url, ie='Generic', url_transparent=True,
+            title=title,
+            age_limit=18,
+            uploader=uploader,
+            uploader_id=uploader_id)
+
+
+class ThisVidPlaylistBaseIE(InfoExtractor):
+    # Shared paging logic for list-like pages. Subclasses set _PLAYLIST_URL_RE
+    # to the pattern that entry links on the page must match.
+    _PLAYLIST_URL_RE = None
+
+    @classmethod
+    def _find_urls(cls, html):
+        """Yield the href of every <a> tag in html that matches _PLAYLIST_URL_RE."""
+        link_re = rf'''<a\b[^>]+\bhref\s*=\s*["'](?P<url>{cls._PLAYLIST_URL_RE}\b)[^>]+>'''
+        yield from (mobj.group('url') for mobj in re.finditer(link_re, html))
+
+    @staticmethod
+    def _following_numbered_page(page_url):
+        """Return page_url with its trailing page number incremented.
+
+        A path without a numeric last segment is treated as page 1, so the
+        result ends in `/2`. Returns None if the computed URL equals page_url.
+        """
+        parts = urllib.parse.urlparse(page_url)
+        stem, _, tail = parts.path.rpartition('/')
+        page_num = int_or_none(tail)
+        if page_num is None:
+            stem, page_num = parts.path.rstrip('/'), 1
+        candidate = urllib.parse.urlunparse(
+            parts._replace(path=f'{stem}/{page_num + 1}'))
+        return None if candidate == page_url else candidate
+
+    def _generate_playlist_entries(self, url, playlist_id, html=None):
+        """Yield entry URLs page by page, starting from url.
+
+        If html is given it is used as the first page instead of downloading
+        it. Iteration stops when a page has no `pagination-next` element or no
+        next page URL can be determined.
+        """
+        current_url = url
+        for page_num in itertools.count(1):
+            if not html:
+                html = self._download_webpage(
+                    current_url, playlist_id, note=f'Downloading page {page_num}',
+                    fatal=False) or ''
+
+            yield from self._find_urls(html)
+
+            pager = get_element_by_class('pagination-next', html)
+            if not pager:
+                return
+
+            # member list page: the pager carries a real (non-`#`) link
+            following = urljoin(url, self._search_regex(
+                r'''<a\b[^>]+\bhref\s*=\s*("|')(?P<url>(?!#)(?:(?!\1).)+)''',
+                pager, 'next page link', group='url', default=None))
+
+            # playlist page: the pager exists but has no usable link,
+            # so derive the next URL from the page number instead
+            if following is None:
+                following = self._following_numbered_page(current_url)
+
+            if not following:
+                return
+            current_url, html = following, None
+
+    def _make_playlist_result(self, url):
+        """Download url and build a playlist result of ThisVidIE entries."""
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+
+        # Prefer the OpenGraph title and fall back to the <title> tag
+        raw_title = self._og_search_title(webpage, default=None)
+        if not raw_title:
+            raw_title = self._html_search_regex(
+                r'(?s)<title\b[^>]*>(.+?)</title', webpage, 'title', fatal=False) or ''
+        # Strip the trailing "| ThisVid.com" site suffix; empty becomes None
+        title = re.split(r'(?i)\s*\|\s*ThisVid\.com\s*$', raw_title, 1)[0] or None
+
+        entries = self._generate_playlist_entries(url, playlist_id, webpage)
+        return self.playlist_from_matches(
+            entries, playlist_id=playlist_id, playlist_title=title, ie=ThisVidIE)
+
+
+class ThisVidMemberIE(ThisVidPlaylistBaseIE):
+    # Treats a member page as a playlist of that member's videos.
+    # Only the numeric member id is captured; _VALID_URL is not anchored at the
+    # end, so sub-pages such as /favourite_videos/ or /public_videos/ (see
+    # _TESTS) match too and share the same id.
+    _VALID_URL = r'https?://thisvid\.com/members/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://thisvid.com/members/2140501/',
+        'info_dict': {
+            'id': '2140501',
+            'title': 'Rafflesia\'s Profile',
+        },
+        'playlist_mincount': 16,
+    }, {
+        'url': 'https://thisvid.com/members/2140501/favourite_videos/',
+        'info_dict': {
+            'id': '2140501',
+            'title': 'Rafflesia\'s Favourite Videos',
+        },
+        'playlist_mincount': 15,
+    }, {
+        'url': 'https://thisvid.com/members/636468/public_videos/',
+        'info_dict': {
+            'id': '636468',
+            'title': 'Happymouth\'s Public Videos',
+        },
+        'playlist_mincount': 196,
+    }]
+    # Entries are the links on the page that look like single-video URLs
+    _PLAYLIST_URL_RE = ThisVidIE._VALID_URL
+
+    def _real_extract(self, url):
+        """Return the playlist result built by the shared base-class helper."""
+        return self._make_playlist_result(url)
+
+
+class ThisVidPlaylistIE(ThisVidPlaylistBaseIE):
+    # Playlist pages have the form /playlist/<id>/video/<slug>/: the URL names
+    # both the playlist and one video inside it, so the extractor honours the
+    # playlist/no-playlist choice (see the 'noplaylist' test below).
+    _VALID_URL = r'https?://thisvid\.com/playlist/(?P<id>\d+)/video/(?P<video_id>[A-Za-z0-9-]+)'
+    _TESTS = [{
+        'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
+        'info_dict': {
+            'id': '6615',
+            'title': 'Underwear Stuff',
+        },
+        'playlist_mincount': 200,
+    }, {
+        'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
+        'info_dict': {
+            'id': '1072387',
+            'ext': 'mp4',
+            'title': 'Big Italian Booty 28',
+            'description': 'md5:1bccf7b13765e18fb27bf764dba7ede2',
+            'uploader_id': '367912',
+            'uploader': 'Jcmusclefun',
+            'age_limit': 18,
+            'display_id': 'big-italian-booty-28',
+            'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+1072387/preview\.jpg',
+        },
+        'params': {
+            'noplaylist': True,
+        },
+    }]
+    # Entry links on a playlist page are themselves playlist-style URLs
+    _PLAYLIST_URL_RE = _VALID_URL
+
+    @staticmethod
+    def _strip_duplicated_title(title):
+        """Collapse a title of the form `X - X` to `X`; return others unchanged.
+
+        Only odd-length titles longer than 5 characters whose exact middle
+        character is `-` are considered, and both halves must be equal and
+        non-empty after stripping whitespace.
+        """
+        length = len(title)
+        if length <= 5 or length % 2 == 0:
+            return title
+        middle = length // 2
+        if title[middle] != '-':
+            return title
+        first, second = map(str.strip, (title[:middle], title[middle + 1:]))
+        return first if first and first == second else title
+
+    def _generate_playlist_entries(self, url, playlist_id, html=None):
+        """Yield plain /videos/<slug>/ URLs for each playlist-style link found."""
+        for wrapped_url in super()._generate_playlist_entries(url, playlist_id, html):
+            video_id = self._match_valid_url(wrapped_url).group('video_id')
+            yield urljoin(url, f'/videos/{video_id}/')
+
+    def _real_extract(self, url):
+        """Return the whole playlist, or only the named video if so requested."""
+        playlist_id, video_id = self._match_valid_url(url).group('id', 'video_id')
+
+        if not self._yes_playlist(playlist_id, video_id):
+            redirect_url = urljoin(url, f'/videos/{video_id}/')
+            return self.url_result(redirect_url, ThisVidIE)
+
+        result = self._make_playlist_result(url)
+
+        # Fix duplicated title (`the title - the title` => `the title`).
+        # The title may be missing when neither the OpenGraph nor the <title>
+        # lookup succeeded, so only touch it when one is present.
+        title = result.get('title')
+        if title:
+            result['title'] = self._strip_duplicated_title(title)
+
+        return result