yt_dlp/extractor/motherless.py

   1 import datetime
   2 import re
   3 import urllib.parse
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     ExtractorError,
   8     OnDemandPagedList,
   9     remove_end,
  10     str_to_int,
  11     unified_strdate,
  12 )
  13
  14
  15 class MotherlessIE(InfoExtractor):
  16     _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/|G[VIG]?[A-F0-9]+/)?(?P<id>[A-F0-9]+)'
  17     _TESTS = [{
  18         'url': 'http://motherless.com/EE97006',
  19         'md5': 'cb5e7438f7a3c4e886b7bccc1292a3bc',
  20         'info_dict': {
  21             'id': 'EE97006',
  22             'ext': 'mp4',
  23             'title': 'Dogging blond Brit getting glazed (comp)',
  24             'categories': ['UK', 'slag', 'whore', 'dogging', 'cunt', 'cumhound', 'big tits', 'Pearl Necklace'],
  25             'upload_date': '20230519',
  26             'uploader_id': 'deathbird',
  27             'thumbnail': r're:https?://.*\.jpg',
  28             'age_limit': 18,
  29             'comment_count': int,
  30             'view_count': int,
  31             'like_count': int,
  32         },
  33         'params': {
  34             # Incomplete cert chains
  35             'nocheckcertificate': True,
  36         },
  37     }, {
  38         'url': 'http://motherless.com/532291B',
  39         'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
  40         'info_dict': {
  41             'id': '532291B',
  42             'ext': 'mp4',
  43             'title': 'Amazing girl playing the omegle game, PERFECT!',
  44             'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen',
  45                            'game', 'hairy'],
  46             'upload_date': '20140622',
  47             'uploader_id': 'Sulivana7x',
  48             'thumbnail': r're:https?://.*\.jpg',
  49             'age_limit': 18,
  50         },
  51         'skip': '404',
  52     }, {
  53         'url': 'http://motherless.com/g/cosplay/633979F',
  54         'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
  55         'info_dict': {
  56             'id': '633979F',
  57             'ext': 'mp4',
  58             'title': 'Turtlette',
  59             'categories': ['superheroine heroine superher'],
  60             'upload_date': '20140827',
  61             'uploader_id': 'shade0230',
  62             'thumbnail': r're:https?://.*\.jpg',
  63             'age_limit': 18,
  64             'like_count': int,
  65             'comment_count': int,
  66             'view_count': int,
  67         },
  68         'params': {
  69             'nocheckcertificate': True,
  70         },
  71     }, {
  72         'url': 'http://motherless.com/8B4BBC1',
  73         'info_dict': {
  74             'id': '8B4BBC1',
  75             'ext': 'mp4',
  76             'title': 'VIDEO00441.mp4',
  77             'categories': [],
  78             'upload_date': '20160214',
  79             'uploader_id': 'NMWildGirl',
  80             'thumbnail': r're:https?://.*\.jpg',
  81             'age_limit': 18,
  82             'like_count': int,
  83             'comment_count': int,
  84             'view_count': int,
  85         },
  86         'params': {
  87             'nocheckcertificate': True,
  88         },
  89     }, {
  90         # see https://motherless.com/videos/recent for recent videos with
  91         # uploaded date in "ago" format
  92         'url': 'https://motherless.com/3C3E2CF',
  93         'info_dict': {
  94             'id': '3C3E2CF',
  95             'ext': 'mp4',
  96             'title': 'a/ Hot Teens',
  97             'categories': list,
  98             'upload_date': '20210104',
  99             'uploader_id': 'anonymous',
 100             'thumbnail': r're:https?://.*\.jpg',
 101             'age_limit': 18,
 102             'like_count': int,
 103             'comment_count': int,
 104             'view_count': int,
 105         },
 106         'params': {
 107             'nocheckcertificate': True,
 108         },
 109     }]
 110
 111     def _real_extract(self, url):
 112         video_id = self._match_id(url)
 113         webpage = self._download_webpage(url, video_id)
 114
 115         if any(p in webpage for p in (
 116                 '<title>404 - MOTHERLESS.COM<',
 117                 ">The page you're looking for cannot be found.<")):
 118             raise ExtractorError('Video %s does not exist' % video_id, expected=True)
 119
 120         if '>The content you are trying to view is for friends only.' in webpage:
 121             raise ExtractorError('Video %s is for friends only' % video_id, expected=True)
 122
 123         title = self._html_search_regex(
 124             (r'(?s)<div[^>]+\bclass=["\']media-meta-title[^>]+>(.+?)</div>',
 125              r'id="view-upload-title">\s+([^<]+)<'), webpage, 'title')
 126         video_url = (self._html_search_regex(
 127             (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
 128              r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'),
 129             webpage, 'video URL', default=None, group='url')
 130             or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
 131         age_limit = self._rta_search(webpage)
 132         view_count = str_to_int(self._html_search_regex(
 133             (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
 134             webpage, 'view count', fatal=False))
 135         like_count = str_to_int(self._html_search_regex(
 136             (r'>([\d,.]+)\s+Favorites<',
 137              r'<strong>Favorited</strong>\s+([^<]+)<'),
 138             webpage, 'like count', fatal=False))
 139
 140         upload_date = unified_strdate(self._search_regex(
 141             r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage,
 142             'upload date', default=None))
 143         if not upload_date:
 144             uploaded_ago = self._search_regex(
 145                 r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago',
 146                 default=None)
 147             if uploaded_ago:
 148                 delta = int(uploaded_ago[:-1])
 149                 _AGO_UNITS = {
 150                     'h': 'hours',
 151                     'd': 'days',
 152                 }
 153                 kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
 154                 upload_date = (datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
 155
 156         comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage))
 157         uploader_id = self._html_search_regex(
 158             (r'''<span\b[^>]+\bclass\s*=\s*["']username\b[^>]*>([^<]+)</span>''',
 159              r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)'''),
 160             webpage, 'uploader_id', fatal=False)
 161         categories = self._html_search_meta('keywords', webpage, default='')
 162         categories = [cat.strip() for cat in categories.split(',') if cat.strip()]
 163
 164         return {
 165             'id': video_id,
 166             'title': title,
 167             'upload_date': upload_date,
 168             'uploader_id': uploader_id,
 169             'thumbnail': self._og_search_thumbnail(webpage),
 170             'categories': categories,
 171             'view_count': view_count,
 172             'like_count': like_count,
 173             'comment_count': comment_count,
 174             'age_limit': age_limit,
 175             'url': video_url,
 176         }
 177
 178
 179 class MotherlessPaginatedIE(InfoExtractor):
 180     _PAGE_SIZE = 60
 181
 182     def _correct_path(self, url, item_id):
 183         raise NotImplementedError('This method must be implemented by subclasses')
 184
 185     def _extract_entries(self, webpage, base):
 186         for mobj in re.finditer(r'href="[^"]*(?P<href>/[A-F0-9]+)"\s+title="(?P<title>[^"]+)',
 187                                 webpage):
 188             video_url = urllib.parse.urljoin(base, mobj.group('href'))
 189             video_id = MotherlessIE.get_temp_id(video_url)
 190
 191             if video_id:
 192                 yield self.url_result(video_url, MotherlessIE, video_id, mobj.group('title'))
 193
 194     def _real_extract(self, url):
 195         item_id = self._match_id(url)
 196         real_url = self._correct_path(url, item_id)
 197         webpage = self._download_webpage(real_url, item_id, 'Downloading page 1')
 198
 199         def get_page(idx):
 200             page = idx + 1
 201             current_page = webpage if not idx else self._download_webpage(
 202                 real_url, item_id, note=f'Downloading page {page}', query={'page': page})
 203             yield from self._extract_entries(current_page, real_url)
 204
 205         return self.playlist_result(
 206             OnDemandPagedList(get_page, self._PAGE_SIZE), item_id,
 207             remove_end(self._html_extract_title(webpage), ' | MOTHERLESS.COM ™'))
 208
 209
 210 class MotherlessGroupIE(MotherlessPaginatedIE):
 211     _VALID_URL = r'https?://(?:www\.)?motherless\.com/g[vifm]?/(?P<id>[a-z0-9_]+)/?(?:$|[#?])'
 212     _TESTS = [{
 213         'url': 'http://motherless.com/gv/movie_scenes',
 214         'info_dict': {
 215             'id': 'movie_scenes',
 216             'title': 'Movie Scenes',
 217         },
 218         'playlist_mincount': 540,
 219     }, {
 220         'url': 'http://motherless.com/g/sex_must_be_funny',
 221         'info_dict': {
 222             'id': 'sex_must_be_funny',
 223             'title': 'Sex must be funny',
 224         },
 225         'playlist_count': 0,
 226     }, {
 227         'url': 'https://motherless.com/gv/beautiful_cock',
 228         'info_dict': {
 229             'id': 'beautiful_cock',
 230             'title': 'Beautiful Cock',
 231         },
 232         'playlist_mincount': 2040,
 233     }]
 234
 235     def _correct_path(self, url, item_id):
 236         return urllib.parse.urljoin(url, f'/gv/{item_id}')
 237
 238
 239 class MotherlessGalleryIE(MotherlessPaginatedIE):
 240     _VALID_URL = r'https?://(?:www\.)?motherless\.com/G[VIG]?(?P<id>[A-F0-9]+)/?(?:$|[#?])'
 241     _TESTS = [{
 242         'url': 'https://motherless.com/GV338999F',
 243         'info_dict': {
 244             'id': '338999F',
 245             'title': 'Random',
 246         },
 247         'playlist_mincount': 190,
 248     }, {
 249         'url': 'https://motherless.com/GVABD6213',
 250         'info_dict': {
 251             'id': 'ABD6213',
 252             'title': 'Cuties',
 253         },
 254         'playlist_mincount': 2,
 255     }, {
 256         'url': 'https://motherless.com/GVBCF7622',
 257         'info_dict': {
 258             'id': 'BCF7622',
 259             'title': 'Vintage',
 260         },
 261         'playlist_count': 0,
 262     }, {
 263         'url': 'https://motherless.com/G035DE2F',
 264         'info_dict': {
 265             'id': '035DE2F',
 266             'title': 'General',
 267         },
 268         'playlist_mincount': 420,
 269     }]
 270
 271     def _correct_path(self, url, item_id):
 272         return urllib.parse.urljoin(url, f'/GV{item_id}')