[yt-dlp.git] / yt_dlp / extractor / jstream.py

import base64
import re
import json

from .common import InfoExtractor
from ..utils import (
    float_or_none,
    js_to_json,
    remove_start,
)


class JStreamIE(InfoExtractor):
    # group "id" only exists for compliance, not directly used in requests
    # also all components are mandatory
    _VALID_URL = r'jstream:(?P<host>www\d+):(?P<id>(?P<publisher>[a-z0-9]+):(?P<mid>\d+))'

    _TESTS = [{
        'url': 'jstream:www50:eqd638pvwx:752',
        'info_dict': {
            'id': 'eqd638pvwx:752',
            'ext': 'mp4',
            'title': '阪神淡路大震災 激震の記録2020年版　解説動画',
            'duration': 672,
            'thumbnail': r're:https?://eqd638pvwx\.eq\.webcdn\.stream\.ne\.jp/.+\.jpg',
        },
    }]

    def _parse_jsonp(self, callback, string, video_id):
        return self._search_json(rf'\s*{re.escape(callback)}\s*\(', string, callback, video_id)

    def _find_formats(self, video_id, movie_list_hls, host, publisher, subtitles):
        for value in movie_list_hls:
            text = value.get('text') or ''
            if not text.startswith('auto'):
                continue
            m3u8_id = remove_start(remove_start(text, 'auto'), '_') or None
            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                f'https://{publisher}.eq.webcdn.stream.ne.jp/{host}/{publisher}/jmc_pub/{value.get("url")}', video_id, 'mp4', m3u8_id=m3u8_id)
            self._merge_subtitles(subs, target=subtitles)
            yield from fmts

    def _real_extract(self, url):
        host, publisher, mid, video_id = self._match_valid_url(url).group('host', 'publisher', 'mid', 'id')
        video_info_jsonp = self._download_webpage(
            f'https://{publisher}.eq.webcdn.stream.ne.jp/{host}/{publisher}/jmc_pub/eq_meta/v1/{mid}.jsonp',
            video_id, 'Requesting video info')
        video_info = self._parse_jsonp('metaDataResult', video_info_jsonp, video_id)['movie']
        subtitles = {}
        formats = list(self._find_formats(video_id, video_info.get('movie_list_hls'), host, publisher, subtitles))
        self._remove_duplicate_formats(formats)
        return {
            'id': video_id,
            'title': video_info.get('title'),
            'duration': float_or_none(video_info.get('duration')),
            'thumbnail': video_info.get('thumbnail_url'),
            'formats': formats,
            'subtitles': subtitles,
        }

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        # check for eligiblity of webpage
        # https://support.eq.stream.co.jp/hc/ja/articles/115008388147-%E3%83%97%E3%83%AC%E3%82%A4%E3%83%A4%E3%83%BCAPI%E3%81%AE%E3%82%B5%E3%83%B3%E3%83%97%E3%83%AB%E3%82%B3%E3%83%BC%E3%83%89
        script_tag = re.search(r'<script\s*[^>]+?src="https://ssl-cache\.stream\.ne\.jp/(?P<host>www\d+)/(?P<publisher>[a-z0-9]+)/[^"]+?/if\.js"', webpage)
        if not script_tag:
            return
        host, publisher = script_tag.groups()
        for m in re.finditer(r'(?s)PlayerFactoryIF\.create\(\s*({[^\}]+?})\s*\)\s*;', webpage):
            # TODO: using json.loads here as InfoExtractor._parse_json is not classmethod
            info = json.loads(js_to_json(m.group(1)))
            mid = base64.b64decode(info.get('m')).decode()
            yield f'jstream:{host}:{publisher}:{mid}'
Commit	Line	Data
3459d3c5 L	1	import base64
	2	import re
	3	import json
	4
	5	from .common import InfoExtractor
	6	from ..utils import (
	7	float_or_none,
	8	js_to_json,
	9	remove_start,
	10	)
	11
	12
	13	class JStreamIE(InfoExtractor):
	14	# group "id" only exists for compliance, not directly used in requests
	15	# also all components are mandatory
	16	_VALID_URL = r'jstream:(?P<host>www\d+):(?P<id>(?P<publisher>[a-z0-9]+):(?P<mid>\d+))'
	17
	18	_TESTS = [{
	19	'url': 'jstream:www50:eqd638pvwx:752',
	20	'info_dict': {
	21	'id': 'eqd638pvwx:752',
	22	'ext': 'mp4',
	23	'title': '阪神淡路大震災激震の記録2020年版　解説動画',
	24	'duration': 672,
	25	'thumbnail': r're:https?://eqd638pvwx\.eq\.webcdn\.stream\.ne\.jp/.+\.jpg',
	26	},
	27	}]
	28
	29	def _parse_jsonp(self, callback, string, video_id):
	30	return self._search_json(rf'\s{re.escape(callback)}\s\(', string, callback, video_id)
	31
	32	def _find_formats(self, video_id, movie_list_hls, host, publisher, subtitles):
	33	for value in movie_list_hls:
	34	text = value.get('text') or ''
	35	if not text.startswith('auto'):
	36	continue
	37	m3u8_id = remove_start(remove_start(text, 'auto'), '_') or None
	38	fmts, subs = self._extract_m3u8_formats_and_subtitles(
	39	f'https://{publisher}.eq.webcdn.stream.ne.jp/{host}/{publisher}/jmc_pub/{value.get("url")}', video_id, 'mp4', m3u8_id=m3u8_id)
	40	self._merge_subtitles(subs, target=subtitles)
	41	yield from fmts
	42
	43	def _real_extract(self, url):
	44	host, publisher, mid, video_id = self._match_valid_url(url).group('host', 'publisher', 'mid', 'id')
	45	video_info_jsonp = self._download_webpage(
	46	f'https://{publisher}.eq.webcdn.stream.ne.jp/{host}/{publisher}/jmc_pub/eq_meta/v1/{mid}.jsonp',
	47	video_id, 'Requesting video info')
	48	video_info = self._parse_jsonp('metaDataResult', video_info_jsonp, video_id)['movie']
	49	subtitles = {}
	50	formats = list(self._find_formats(video_id, video_info.get('movie_list_hls'), host, publisher, subtitles))
	51	self._remove_duplicate_formats(formats)
	52	return {
	53	'id': video_id,
	54	'title': video_info.get('title'),
	55	'duration': float_or_none(video_info.get('duration')),
	56	'thumbnail': video_info.get('thumbnail_url'),
	57	'formats': formats,
	58	'subtitles': subtitles,
	59	}
	60
	61	@classmethod
	62	def _extract_embed_urls(cls, url, webpage):
	63	# check for eligiblity of webpage
	64	# https://support.eq.stream.co.jp/hc/ja/articles/115008388147-%E3%83%97%E3%83%AC%E3%82%A4%E3%83%A4%E3%83%BCAPI%E3%81%AE%E3%82%B5%E3%83%B3%E3%83%97%E3%83%AB%E3%82%B3%E3%83%BC%E3%83%89
65	script_tag = re.search(r'<script\s*[^>]+?src="https://ssl-cache\.stream\.ne\.jp/(?P<host>www\d+)/(?P<publisher>[a-z0-9]+)/[^"]+?/if\.js"', webpage)
66	if not script_tag:
67	return
68	host, publisher = script_tag.groups()
69	for m in re.finditer(r'(?s)PlayerFactoryIF\.create\(\s({[^\}]+?})\s\)\s*;', webpage):
70	# TODO: using json.loads here as InfoExtractor._parse_json is not classmethod
71	info = json.loads(js_to_json(m.group(1)))
72	mid = base64.b64decode(info.get('m')).decode()
73	yield f'jstream:{host}:{publisher}:{mid}'