[ie/orf:on] Improve extraction (#9677)

[yt-dlp.git] / yt_dlp / extractor / mildom.py
diff --git a/yt_dlp/extractor/mildom.py b/yt_dlp/extractor/mildom.py

index 3606f09b4eb6f29d3c7ae20c382119307b37d87e..f64d575dcc8126f797bdf4e09804c0ca2d5ae28c 100644 (file)
--- a/yt_dlp/extractor/mildom.py
+++ b/yt_dlp/extractor/mildom.py
@@ -1,100 +1,39 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from datetime import datetime
-import itertools
+import functools
  import json
-import base64
-import re
+import uuid
  
  from .common import InfoExtractor
  from ..utils import (
-    ExtractorError, std_headers,
-    update_url_query,
-    random_uuidv4,
-    try_get,
-)
-from ..compat import (
-    compat_urlparse,
-    compat_urllib_parse_urlencode,
-    compat_str,
+    determine_ext,
+    dict_get,
+    ExtractorError,
+    float_or_none,
+    OnDemandPagedList,
+    traverse_obj,
  )
  
  
  class MildomBaseIE(InfoExtractor):
      _GUEST_ID = None
-    _DISPATCHER_CONFIG = None
-
-    def _call_api(self, url, video_id, query={}, note='Downloading JSON metadata', init=False):
-        url = update_url_query(url, self._common_queries(query, init=init))
-        return self._download_json(url, video_id, note=note)['body']
-
-    def _common_queries(self, query={}, init=False):
-        dc = self._fetch_dispatcher_config()
-        r = {
-            'timestamp': self.iso_timestamp(),
-            '__guest_id': '' if init else self.guest_id(),
-            '__location': dc['location'],
-            '__country': dc['country'],
-            '__cluster': dc['cluster'],
-            '__platform': 'web',
-            '__la': self.lang_code(),
-            '__pcv': 'v2.9.44',
-            'sfr': 'pc',
-            'accessToken': '',
-        }
-        r.update(query)
-        return r
-
-    def _fetch_dispatcher_config(self):
-        if not self._DISPATCHER_CONFIG:
-            try:
-                tmp = self._download_json(
-                    'https://disp.mildom.com/serverListV2', 'initialization',
-                    note='Downloading dispatcher_config', data=json.dumps({
-                        'protover': 0,
-                        'data': base64.b64encode(json.dumps({
-                            'fr': 'web',
-                            'sfr': 'pc',
-                            'devi': 'Windows',
-                            'la': 'ja',
-                            'gid': None,
-                            'loc': '',
-                            'clu': '',
-                            'wh': '1919*810',
-                            'rtm': self.iso_timestamp(),
-                            'ua': std_headers['User-Agent'],
-                        }).encode('utf8')).decode('utf8').replace('\n', ''),
-                    }).encode('utf8'))
-                self._DISPATCHER_CONFIG = self._parse_json(base64.b64decode(tmp['data']), 'initialization')
-            except ExtractorError:
-                self._DISPATCHER_CONFIG = self._download_json(
-                    'https://bookish-octo-barnacle.vercel.app/api/mildom/dispatcher_config', 'initialization',
-                    note='Downloading dispatcher_config fallback')
-        return self._DISPATCHER_CONFIG
-
-    @staticmethod
-    def iso_timestamp():
-        'new Date().toISOString()'
-        return datetime.utcnow().isoformat()[0:-3] + 'Z'
-
-    def guest_id(self):
-        'getGuestId'
-        if self._GUEST_ID:
-            return self._GUEST_ID
-        self._GUEST_ID = try_get(
-            self, (
-                lambda x: x._call_api(
-                    'https://cloudac.mildom.com/nonolive/gappserv/guest/h5init', 'initialization',
-                    note='Downloading guest token', init=True)['guest_id'] or None,
-                lambda x: x._get_cookies('https://www.mildom.com').get('gid').value,
-                lambda x: x._get_cookies('https://m.mildom.com').get('gid').value,
-            ), compat_str) or ''
-        return self._GUEST_ID
-
-    def lang_code(self):
-        'getCurrentLangCode'
-        return 'ja'
+
+    def _call_api(self, url, video_id, query=None, note='Downloading JSON metadata', body=None):
+        """Call a Mildom JSON API endpoint and return the 'body' of its reply.
+
+        `query` entries override the default '__guest_id'/'__platform' parameters;
+        `body`, when not None, is JSON-encoded and sent as request data.
+        Raises ExtractorError (expected) if the reply's 'code' is non-zero.
+        """
+        if not self._GUEST_ID:
+            self._GUEST_ID = f'pc-gp-{uuid.uuid4()}'
+        has_body = body is not None  # an empty dict is still a payload, so do not rely on truthiness
+        content = self._download_json(
+            url, video_id, note=note,
+            data=json.dumps(body).encode() if has_body else None,
+            headers={'Content-Type': 'application/json'} if has_body else {},
+            query={'__guest_id': self._GUEST_ID, '__platform': 'web', **(query or {})})
+        if content['code'] != 0:
+            raise ExtractorError(f'Mildom says: {content["message"]} (code {content["code"]})', expected=True)
+        return content['body']
  
  
  class MildomIE(MildomBaseIE):
@@ -104,31 +43,13 @@ class MildomIE(MildomBaseIE):
  
      def _real_extract(self, url):
          video_id = self._match_id(url)
-        url = 'https://www.mildom.com/%s' % video_id
-
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(f'https://www.mildom.com/{video_id}', video_id)
  
          enterstudio = self._call_api(
              'https://cloudac.mildom.com/nonolive/gappserv/live/enterstudio', video_id,
              note='Downloading live metadata', query={'user_id': video_id})
          result_video_id = enterstudio.get('log_id', video_id)
  
-        title = try_get(
-            enterstudio, (
-                lambda x: self._html_search_meta('twitter:description', webpage),
-                lambda x: x['anchor_intro'],
-            ), compat_str)
-        description = try_get(
-            enterstudio, (
-                lambda x: x['intro'],
-                lambda x: x['live_intro'],
-            ), compat_str)
-        uploader = try_get(
-            enterstudio, (
-                lambda x: self._html_search_meta('twitter:title', webpage),
-                lambda x: x['loginname'],
-            ), compat_str)
-
          servers = self._call_api(
              'https://cloudac.mildom.com/nonolive/gappserv/live/liveserver', result_video_id,
              note='Downloading live server list', query={
@@ -136,32 +57,29 @@ def _real_extract(self, url):
                  'live_server_type': 'hls',
              })
  
-        stream_query = self._common_queries({
-            'streamReqId': random_uuidv4(),
-            'is_lhls': '0',
-        })
-        m3u8_url = update_url_query(servers['stream_server'] + '/%s_master.m3u8' % video_id, stream_query)
-        formats = self._extract_m3u8_formats(m3u8_url, result_video_id, 'mp4', headers={
-            'Referer': 'https://www.mildom.com/',
-            'Origin': 'https://www.mildom.com',
-        }, note='Downloading m3u8 information')
-        del stream_query['streamReqId'], stream_query['timestamp']
-        for fmt in formats:
-            # Uses https://github.com/nao20010128nao/bookish-octo-barnacle by @nao20010128nao as a proxy
-            parsed = compat_urlparse.urlparse(fmt['url'])
-            parsed = parsed._replace(
-                netloc='bookish-octo-barnacle.vercel.app',
-                query=compat_urllib_parse_urlencode(stream_query, True),
-                path='/api/mildom' + parsed.path)
-            fmt['url'] = compat_urlparse.urlunparse(parsed)
+        playback_token = self._call_api(
+            'https://cloudac.mildom.com/nonolive/gappserv/live/token', result_video_id,
+            note='Obtaining live playback token', body={'host_id': video_id, 'type': 'hls'})
+        playback_token = traverse_obj(playback_token, ('data', ..., 'token'), get_all=False)
+        if not playback_token:
+            raise ExtractorError('Failed to obtain live playback token')
+
+        formats = self._extract_m3u8_formats(
+            f'{servers["stream_server"]}/{video_id}_master.m3u8?{playback_token}',
+            result_video_id, 'mp4', headers={
+                'Referer': 'https://www.mildom.com/',
+                'Origin': 'https://www.mildom.com',
+            })
  
-        self._sort_formats(formats)
+        for fmt in formats:
+            fmt.setdefault('http_headers', {})['Referer'] = 'https://www.mildom.com/'
  
          return {
              'id': result_video_id,
-            'title': title,
-            'description': description,
-            'uploader': uploader,
+            'title': self._html_search_meta('twitter:description', webpage, default=None) or traverse_obj(enterstudio, 'anchor_intro'),
+            'description': traverse_obj(enterstudio, 'intro', 'live_intro', expected_type=str),
+            'timestamp': float_or_none(enterstudio.get('live_start_ms'), scale=1000),
+            'uploader': self._html_search_meta('twitter:title', webpage, default=None) or traverse_obj(enterstudio, 'loginname'),
              'uploader_id': video_id,
              'formats': formats,
              'is_live': True,
@@ -170,15 +88,55 @@ def _real_extract(self, url):
  
  class MildomVodIE(MildomBaseIE):
      IE_NAME = 'mildom:vod'
-    IE_DESC = 'Download a VOD in Mildom'
-    _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+)'
+    IE_DESC = 'VOD in Mildom'
+    _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/playback/(?P<user_id>\d+)/(?P<id>(?P=user_id)-[a-zA-Z0-9]+-?[0-9]*)'
+    _TESTS = [{
+        'url': 'https://www.mildom.com/playback/10882672/10882672-1597662269',
+        'info_dict': {
+            'id': '10882672-1597662269',
+            'ext': 'mp4',
+            'title': '始めてのミルダム配信じゃぃ！',
+            'thumbnail': r're:^https?://.*\.(png|jpg)$',
+            'upload_date': '20200817',
+            'duration': 4138.37,
+            'description': 'ゲームをしたくて！',
+            'timestamp': 1597662269.0,
+            'uploader_id': '10882672',
+            'uploader': 'kson組長(けいそん)',
+        },
+    }, {
+        'url': 'https://www.mildom.com/playback/10882672/10882672-1597758589870-477',
+        'info_dict': {
+            'id': '10882672-1597758589870-477',
+            'ext': 'mp4',
+            'title': '【kson】感染メイズ！麻酔銃で無双する',
+            'thumbnail': r're:^https?://.*\.(png|jpg)$',
+            'timestamp': 1597759093.0,
+            'uploader': 'kson組長(けいそん)',
+            'duration': 4302.58,
+            'uploader_id': '10882672',
+            'description': 'このステージ絶対乗り越えたい',
+            'upload_date': '20200818',
+        },
+    }, {
+        'url': 'https://www.mildom.com/playback/10882672/10882672-buha9td2lrn97fk2jme0',
+        'info_dict': {
+            'id': '10882672-buha9td2lrn97fk2jme0',
+            'ext': 'mp4',
+            'title': '【kson組長】CART RACER!!!',
+            'thumbnail': r're:^https?://.*\.(png|jpg)$',
+            'uploader_id': '10882672',
+            'uploader': 'kson組長(けいそん)',
+            'upload_date': '20201104',
+            'timestamp': 1604494797.0,
+            'duration': 4657.25,
+            'description': 'WTF',
+        },
+    }]
  
      def _real_extract(self, url):
-        m = re.match(self._VALID_URL, url)
-        user_id, video_id = m.group('user_id'), m.group('id')
-        url = 'https://www.mildom.com/playback/%s/%s' % (user_id, video_id)
-
-        webpage = self._download_webpage(url, video_id)
+        user_id, video_id = self._match_valid_url(url).group('user_id', 'id')
+        webpage = self._download_webpage(f'https://www.mildom.com/playback/{user_id}/{video_id}', video_id)
  
          autoplay = self._call_api(
              'https://cloudac.mildom.com/nonolive/videocontent/playback/getPlaybackDetail', video_id,
@@ -186,30 +144,16 @@ def _real_extract(self, url):
                  'v_id': video_id,
              })['playback']
  
-        title = try_get(
-            autoplay, (
-                lambda x: self._html_search_meta('og:description', webpage),
-                lambda x: x['title'],
-            ), compat_str)
-        description = try_get(
-            autoplay, (
-                lambda x: x['video_intro'],
-            ), compat_str)
-        uploader = try_get(
-            autoplay, (
-                lambda x: x['author_info']['login_name'],
-            ), compat_str)
-
-        audio_formats = [{
+        formats = [{
              'url': autoplay['audio_url'],
              'format_id': 'audio',
              'protocol': 'm3u8_native',
              'vcodec': 'none',
              'acodec': 'aac',
+            'ext': 'm4a'
          }]
-        video_formats = []
          for fmt in autoplay['video_link']:
-            video_formats.append({
+            formats.append({
                  'format_id': 'video-%s' % fmt['name'],
                  'url': fmt['url'],
                  'protocol': 'm3u8_native',
@@ -217,32 +161,83 @@ def _real_extract(self, url):
                  'height': fmt['level'],
                  'vcodec': 'h264',
                  'acodec': 'aac',
+                'ext': 'mp4'
              })
  
-        stream_query = self._common_queries({
-            'is_lhls': '0',
-        })
-        del stream_query['timestamp']
-        formats = audio_formats + video_formats
-        for fmt in formats:
-            fmt['ext'] = 'mp4'
-            parsed = compat_urlparse.urlparse(fmt['url'])
-            stream_query['path'] = parsed.path[5:]
-            parsed = parsed._replace(
-                netloc='bookish-octo-barnacle.vercel.app',
-                query=compat_urllib_parse_urlencode(stream_query, True),
-                path='/api/mildom/vod2/proxy')
-            fmt['url'] = compat_urlparse.urlunparse(parsed)
+        return {
+            'id': video_id,
+            'title': self._html_search_meta(('og:description', 'description'), webpage, default=None) or autoplay.get('title'),
+            'description': traverse_obj(autoplay, 'video_intro'),
+            'timestamp': float_or_none(autoplay.get('publish_time'), scale=1000),
+            'duration': float_or_none(autoplay.get('video_length'), scale=1000),
+            'thumbnail': dict_get(autoplay, ('upload_pic', 'video_pic')),
+            'uploader': traverse_obj(autoplay, ('author_info', 'login_name')),
+            'uploader_id': user_id,
+            'formats': formats,
+        }
+
  
-        self._sort_formats(formats)
+class MildomClipIE(MildomBaseIE):  # Extractor for a single clip addressed by a mildom.com /clip/<id> URL
+    IE_NAME = 'mildom:clip'
+    IE_DESC = 'Clip in Mildom'
+    _VALID_URL = r'https?://(?:(?:www|m)\.)mildom\.com/clip/(?P<id>(?P<user_id>\d+)-[a-zA-Z0-9]+)'  # 'id' is the whole "<digits>-<alnum>" token; 'user_id' is its leading digits
+    _TESTS = [{
+        'url': 'https://www.mildom.com/clip/10042245-63921673e7b147ebb0806d42b5ba5ce9',
+        'info_dict': {
+            'id': '10042245-63921673e7b147ebb0806d42b5ba5ce9',
+            'title': '全然違ったよ',
+            'timestamp': 1619181890,
+            'duration': 59,
+            'thumbnail': r're:https?://.+',
+            'uploader': 'ざきんぽ',
+            'uploader_id': '10042245',
+        },
+    }, {
+        'url': 'https://www.mildom.com/clip/10111524-ebf4036e5aa8411c99fb3a1ae0902864',
+        'info_dict': {
+            'id': '10111524-ebf4036e5aa8411c99fb3a1ae0902864',
+            'title': 'かっこいい',
+            'timestamp': 1621094003,
+            'duration': 59,
+            'thumbnail': r're:https?://.+',
+            'uploader': '(ルーキー',
+            'uploader_id': '10111524',
+        },
+    }, {
+        'url': 'https://www.mildom.com/clip/10660174-2c539e6e277c4aaeb4b1fbe8d22cb902',
+        'info_dict': {
+            'id': '10660174-2c539e6e277c4aaeb4b1fbe8d22cb902',
+            'title': 'あ',
+            'timestamp': 1614769431,
+            'duration': 31,
+            'thumbnail': r're:https?://.+',
+            'uploader': 'ドルゴルスレンギーン＝ダグワドルジ',
+            'uploader_id': '10660174',
+        },
+    }]
+
+    def _real_extract(self, url):
+        user_id, video_id = self._match_valid_url(url).group('user_id', 'id')  # both named groups come from _VALID_URL
+        webpage = self._download_webpage(f'https://www.mildom.com/clip/{video_id}', video_id)  # only used for the <meta> title lookup below
+
+        clip_detail = self._call_api(
+            'https://cloudac-cf-jp.mildom.com/nonolive/videocontent/clip/detail', video_id,
+            note='Downloading playback metadata', query={
+                'clip_id': video_id,
+            })
  
          return {
              'id': video_id,
-            'title': title,
-            'description': description,
-            'uploader': uploader,
+            'title': self._html_search_meta(  # page metadata is tried first, then the API's 'title'
+                ('og:description', 'description'), webpage, default=None) or clip_detail.get('title'),
+            'timestamp': float_or_none(clip_detail.get('create_time')),
+            'duration': float_or_none(clip_detail.get('length')),
+            'thumbnail': clip_detail.get('cover'),
+            'uploader': traverse_obj(clip_detail, ('user_info', 'loginname')),
              'uploader_id': user_id,
-            'formats': formats,
+
+            'url': clip_detail['url'],  # mandatory: a reply without 'url' raises KeyError here
+            'ext': determine_ext(clip_detail.get('url'), 'mp4'),  # NOTE(review): 'mp4' is presumably the fallback extension - confirm against determine_ext
          }
  
  
@@ -256,30 +251,41 @@ class MildomUserVodIE(MildomBaseIE):
              'id': '10093333',
              'title': 'Uploads from ねこばたけ',
          },
-        'playlist_mincount': 351,
+        'playlist_mincount': 732,
+    }, {
+        'url': 'https://www.mildom.com/profile/10882672',
+        'info_dict': {
+            'id': '10882672',
+            'title': 'Uploads from kson組長(けいそん)',
+        },
+        'playlist_mincount': 201,
      }]
  
+    def _fetch_page(self, user_id, page):
+        """Yield one URL result per listed playback that carries a 'v_id'.
+
+        `page` is incremented by one before it is sent to the API.
+        """
+        page_num = page + 1
+        entries = self._call_api(
+            'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList',
+            user_id, note=f'Downloading page {page_num}',
+            query={'user_id': user_id, 'page': page_num, 'limit': '30'})
+        # A falsy reply produces no results
+        for entry in entries or ():
+            vod_id = entry.get('v_id')
+            if vod_id:
+                yield self.url_result(
+                    f'https://www.mildom.com/playback/{user_id}/{vod_id}')
+
      def _real_extract(self, url):
          user_id = self._match_id(url)
-
-        self.report_warning('To download ongoing live, please use "https://www.mildom.com/%s" instead. This will list up VODs belonging to user.' % user_id)
+        self.to_screen('This will download all VODs belonging to user. To download ongoing live video, use "https://www.mildom.com/%s" instead' % user_id)  # informational hint; replaces the former warning
  
          profile = self._call_api(
              'https://cloudac.mildom.com/nonolive/gappserv/user/profileV2', user_id,
              query={'user_id': user_id}, note='Downloading user profile')['user_info']
  
-        results = []
-        for page in itertools.count(1):
-            reply = self._call_api(
-                'https://cloudac.mildom.com/nonolive/videocontent/profile/playbackList',
-                user_id, note='Downloading page %d' % page, query={
-                    'user_id': user_id,
-                    'page': page,
-                    'limit': '30',
-                })
-            if not reply:
-                break
-            results.extend('https://www.mildom.com/playback/%s/%s' % (user_id, x['v_id']) for x in reply)
-        return self.playlist_result([
-            self.url_result(u, ie=MildomVodIE.ie_key()) for u in results
-        ], user_id, 'Uploads from %s' % profile['loginname'])
+        return self.playlist_result(  # entries are produced by _fetch_page with user_id pre-bound
+            OnDemandPagedList(functools.partial(self._fetch_page, user_id), 30),  # NOTE(review): 30 presumably must equal the 'limit' sent by _fetch_page - confirm
+            user_id, f'Uploads from {profile["loginname"]}')