yt_dlp/extractor/openrec.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     ExtractorError,
   7     get_first,
   8     int_or_none,
   9     traverse_obj,
  10     unified_strdate,
  11     unified_timestamp,
  12 )
  13 from ..compat import compat_str
  14
  15
  16 class OpenRecBaseIE(InfoExtractor):
  17     def _extract_pagestore(self, webpage, video_id):
  18         return self._parse_json(
  19             self._search_regex(r'(?m)window\.pageStore\s*=\s*(\{.+?\});$', webpage, 'window.pageStore'), video_id)
  20
  21     def _extract_movie(self, webpage, video_id, name, is_live):
  22         window_stores = self._extract_pagestore(webpage, video_id)
  23         movie_stores = [
  24             # extract all three important data (most of data are duplicated each other, but slightly different!)
  25             traverse_obj(window_stores, ('v8', 'state', 'movie'), expected_type=dict),
  26             traverse_obj(window_stores, ('v8', 'movie'), expected_type=dict),
  27             traverse_obj(window_stores, 'movieStore', expected_type=dict),
  28         ]
  29         if not any(movie_stores):
  30             raise ExtractorError(f'Failed to extract {name} info')
  31
  32         m3u8_playlists = get_first(movie_stores, 'media') or {}
  33         formats = []
  34         for name, m3u8_url in m3u8_playlists.items():
  35             if not m3u8_url:
  36                 continue
  37             formats.extend(self._extract_m3u8_formats(
  38                 m3u8_url, video_id, ext='mp4', live=is_live, m3u8_id=name))
  39
  40         self._sort_formats(formats)
  41
  42         return {
  43             'id': video_id,
  44             'title': get_first(movie_stores, 'title'),
  45             'description': get_first(movie_stores, 'introduction'),
  46             'thumbnail': get_first(movie_stores, 'thumbnailUrl'),
  47             'formats': formats,
  48             'uploader': get_first(movie_stores, ('channel', 'user', 'name')),
  49             'uploader_id': get_first(movie_stores, ('channel', 'user', 'id')),
  50             'timestamp': int_or_none(get_first(movie_stores, ['publishedAt', 'time']), scale=1000) or unified_timestamp(get_first(movie_stores, 'publishedAt')),
  51             'is_live': is_live,
  52         }
  53
  54
  55 class OpenRecIE(OpenRecBaseIE):
  56     IE_NAME = 'openrec'
  57     _VALID_URL = r'https?://(?:www\.)?openrec\.tv/live/(?P<id>[^/]+)'
  58     _TESTS = [{
  59         'url': 'https://www.openrec.tv/live/2p8v31qe4zy',
  60         'only_matching': True,
  61     }, {
  62         'url': 'https://www.openrec.tv/live/wez93eqvjzl',
  63         'only_matching': True,
  64     }]
  65
  66     def _real_extract(self, url):
  67         video_id = self._match_id(url)
  68         webpage = self._download_webpage(f'https://www.openrec.tv/live/{video_id}', video_id)
  69
  70         return self._extract_movie(webpage, video_id, 'live', True)
  71
  72
  73 class OpenRecCaptureIE(OpenRecBaseIE):
  74     IE_NAME = 'openrec:capture'
  75     _VALID_URL = r'https?://(?:www\.)?openrec\.tv/capture/(?P<id>[^/]+)'
  76     _TESTS = [{
  77         'url': 'https://www.openrec.tv/capture/l9nk2x4gn14',
  78         'only_matching': True,
  79     }, {
  80         'url': 'https://www.openrec.tv/capture/mldjr82p7qk',
  81         'info_dict': {
  82             'id': 'mldjr82p7qk',
  83             'title': 'たいじの恥ずかしい英語力',
  84             'uploader': 'たいちゃんねる',
  85             'uploader_id': 'Yaritaiji',
  86             'upload_date': '20210803',
  87         },
  88     }]
  89
  90     def _real_extract(self, url):
  91         video_id = self._match_id(url)
  92         webpage = self._download_webpage(f'https://www.openrec.tv/capture/{video_id}', video_id)
  93
  94         window_stores = self._extract_pagestore(webpage, video_id)
  95         movie_store = window_stores.get('movie')
  96
  97         capture_data = window_stores.get('capture')
  98         if not capture_data:
  99             raise ExtractorError('Cannot extract title')
 100
 101         formats = self._extract_m3u8_formats(
 102             capture_data.get('source'), video_id, ext='mp4')
 103         self._sort_formats(formats)
 104
 105         return {
 106             'id': video_id,
 107             'title': capture_data.get('title'),
 108             'thumbnail': capture_data.get('thumbnailUrl'),
 109             'formats': formats,
 110             'timestamp': unified_timestamp(traverse_obj(movie_store, 'createdAt', expected_type=compat_str)),
 111             'uploader': traverse_obj(movie_store, ('channel', 'name'), expected_type=compat_str),
 112             'uploader_id': traverse_obj(movie_store, ('channel', 'id'), expected_type=compat_str),
 113             'upload_date': unified_strdate(capture_data.get('createdAt')),
 114         }
 115
 116
 117 class OpenRecMovieIE(OpenRecBaseIE):
 118     IE_NAME = 'openrec:movie'
 119     _VALID_URL = r'https?://(?:www\.)?openrec\.tv/movie/(?P<id>[^/]+)'
 120     _TESTS = [{
 121         'url': 'https://www.openrec.tv/movie/nqz5xl5km8v',
 122         'info_dict': {
 123             'id': 'nqz5xl5km8v',
 124             'title': '限定コミュニティ(Discord)参加方法ご説明動画',
 125             'description': 'md5:ebd563e5f5b060cda2f02bf26b14d87f',
 126             'thumbnail': r're:https://.+',
 127             'uploader': 'タイキとカズヒロ',
 128             'uploader_id': 'taiki_to_kazuhiro',
 129             'timestamp': 1638856800,
 130         },
 131     }]
 132
 133     def _real_extract(self, url):
 134         video_id = self._match_id(url)
 135         webpage = self._download_webpage(f'https://www.openrec.tv/movie/{video_id}', video_id)
 136
 137         return self._extract_movie(webpage, video_id, 'movie', False)