[yt-dlp.git] / yt_dlp / extractor / japandiet.py

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    clean_html,
    int_or_none,
    join_nonempty,
    parse_qs,
    smuggle_url,
    traverse_obj,
    try_call,
    unsmuggle_url,
)


def _parse_japanese_date(text):
    if not text:
        return None
    ERA_TABLE = {
        '明治': 1868,
        '大正': 1912,
        '昭和': 1926,
        '平成': 1989,
        '令和': 2019,
    }
    ERA_RE = '|'.join(map(re.escape, ERA_TABLE.keys()))
    mobj = re.search(rf'({ERA_RE})?(\d+)年(\d+)月(\d+)日', re.sub(r'[\s\u3000]+', '', text))
    if not mobj:
        return None
    era, year, month, day = mobj.groups()
    year, month, day = map(int, (year, month, day))
    if era:
        # example input: 令和5年3月34日
        # even though each era have their end, don't check here
        year += ERA_TABLE[era]
    return '%04d%02d%02d' % (year, month, day)


def _parse_japanese_duration(text):
    mobj = re.search(r'(?:(\d+)日間?)?(?:(\d+)時間?)?(?:(\d+)分)?(?:(\d+)秒)?', re.sub(r'[\s\u3000]+', '', text or ''))
    if not mobj:
        return
    days, hours, mins, secs = [int_or_none(x, default=0) for x in mobj.groups()]
    return secs + mins * 60 + hours * 60 * 60 + days * 24 * 60 * 60


class ShugiinItvBaseIE(InfoExtractor):
    """Shared helpers for the shugiintv.go.jp extractors."""

    # Cache of the room entries scraped from the index page.
    # It is stored on ShugiinItvBaseIE itself, so all subclasses share it.
    _INDEX_ROOMS = None

    @classmethod
    def _find_rooms(cls, webpage):
        """Build one ``url`` result per live room linked from the index page.

        @param webpage  HTML of the index page.
        @returns        List of ``url``-type result dicts. The room id and the
                        cleaned title are smuggled into each URL under ``g``.
        """
        rooms = []
        for mobj in re.finditer(
                r'(?s)<a\s+href="[^"]+\?room_id=(room\d+)"\s*class="play_live".+?class="s12_14">(.+?)</td>', webpage):
            room_id = mobj.group(1)
            title = clean_html(mobj.group(2)).strip()
            rooms.append({
                '_type': 'url',
                'id': room_id,
                'title': title,
                # Smuggle the cleaned title (not the raw HTML fragment), since the
                # receiving extractor uses it verbatim
                'url': smuggle_url(
                    f'https://www.shugiintv.go.jp/jp/index.php?room_id={room_id}', {'g': (room_id, title)}),
                # Room URLs are handled by ShugiinItvLiveRoomIE; ShugiinItvLiveIE
                # only accepts the bare index page and rejects "?room_id=" URLs
                'ie_key': ShugiinItvLiveRoomIE.ie_key(),
            })
        return rooms

    def _fetch_rooms(self):
        """Return the room entries, downloading the index page if nothing is cached yet."""
        if not self._INDEX_ROOMS:
            webpage = self._download_webpage(
                'https://www.shugiintv.go.jp/jp/index.php', None,
                encoding='euc-jp', note='Downloading proceedings info')
            ShugiinItvBaseIE._INDEX_ROOMS = self._find_rooms(webpage)
        return self._INDEX_ROOMS


class ShugiinItvLiveIE(ShugiinItvBaseIE):
    """Playlist extractor for the shugiintv.go.jp index page (all rooms at once)."""

    _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)(?:/index\.php)?$'
    IE_DESC = '衆議院インターネット審議中継'

    _TESTS = [{
        'url': 'https://www.shugiintv.go.jp/jp/index.php',
        'info_dict': {
            '_type': 'playlist',
            'title': 'All proceedings for today',
        },
        # expect at least one proceedings is running
        'playlist_mincount': 1,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept ``url`` only if it matches and neither the room nor the VOD extractor claims it."""
        if not super().suitable(url):
            return False
        for other_ie in (ShugiinItvLiveRoomIE, ShugiinItvVodIE):
            if other_ie.suitable(url):
                return False
        return True

    def _real_extract(self, url):
        """Return a playlist containing every room found on the index page."""
        self.to_screen(
            'Downloading all running proceedings. To specify one proceeding, use direct link from the website')
        entries = self._fetch_rooms()
        return self.playlist_result(entries, playlist_title='All proceedings for today')


class ShugiinItvLiveRoomIE(ShugiinItvBaseIE):
    """Extractor for the live stream of a single shugiintv.go.jp room."""

    _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?room_id=(?P<id>room\d+)'
    IE_DESC = '衆議院インターネット審議中継 (中継)'

    _TESTS = [{
        'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room01',
        'info_dict': {
            'id': 'room01',
            'title': '内閣委員会',
        },
        'skip': 'this runs for a time and not every day',
    }, {
        'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room11',
        'info_dict': {
            'id': 'room11',
            'title': '外務委員会',
        },
        'skip': 'this runs for a time and not every day',
    }]

    def _real_extract(self, url):
        """Return the live HLS formats for the room named in ``url``."""
        url, smuggled = unsmuggle_url(url, default={})
        room_info = smuggled.get('g')
        if room_info:
            # Room id and title were passed along with the URL
            room_id, title = room_info
        else:
            # Direct link: look the title up in the room list of the index page
            room_id = self._match_id(url)
            title = traverse_obj(self._fetch_rooms(), (lambda k, v: v['id'] == room_id, 'title'), get_all=False)

        playlist_url = f'https://hlslive.shugiintv.go.jp/{room_id}/amlst:{room_id}/playlist.m3u8'
        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            playlist_url, room_id, ext='mp4')

        info = {
            'id': room_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'is_live': True,
        }
        return info


class ShugiinItvVodIE(ShugiinItvBaseIE):
    """Extractor for shugiintv.go.jp video library (VOD) pages."""

    _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?ex=VL(?:\&[^=]+=[^&]*)*\&deli_id=(?P<id>\d+)'
    IE_DESC = '衆議院インターネット審議中継 (ビデオライブラリ)'
    _TESTS = [{
        'url': 'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id=53846',
        'info_dict': {
            'id': '53846',
            'title': 'ウクライナ大統領国会演説（オンライン）',
            'release_date': '20220323',
            'chapters': 'count:4',
        }
    }, {
        'url': 'https://www.shugiintv.go.jp/en/index.php?ex=VL&media_type=&deli_id=53846',
        'only_matching': True
    }]

    def _real_extract(self, url):
        """Extract formats, title, release date and chapters for one archived video.

        The Japanese ("/jp/") page is always the one downloaded, regardless of
        the language in ``url``.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            f'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id={video_id}', video_id,
            encoding='euc-jp')

        m3u8_url = self._search_regex(
            r'id="vtag_src_base_vod"\s*value="(http.+?\.m3u8)"', webpage, 'm3u8 url')
        # Upgrade plain-HTTP playlist URLs to HTTPS
        m3u8_url = re.sub(r'^http://', 'https://', m3u8_url)
        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            m3u8_url, video_id, ext='mp4')

        title = self._html_search_regex(
            (r'<td\s+align="left">(.+)\s*\(\d+分\)',
             r'<TD.+?<IMG\s*src=".+?/spacer\.gif".+?height="15">(.+?)<IMG'), webpage, 'title', fatal=False)

        release_date = _parse_japanese_date(self._html_search_regex(
            r'開会日</td>\s*<td.+?/td>\s*<TD>(.+?)</TD>',
            webpage, 'release date', fatal=False))

        chapters = []
        for chp in re.finditer(r'(?i)<A\s+HREF="([^"]+?)"\s*class="play_vod">(?!<img)(.+)</[Aa]>', webpage):
            chapters.append({
                'title': clean_html(chp.group(2)).strip(),
                # None when the link carries no usable "time" query parameter
                'start_time': try_call(lambda: float(parse_qs(chp.group(1))['time'][0].strip())),
            })
        # NOTE: there are blanks at the first and the end of the videos,
        # so getting/providing the video duration is not possible
        # also, the exact end_time for the last chapter is unknown (we can get at most minutes of granularity)
        if chapters:
            # The last cell of the last "s14_24" row holds the length of the last chapter.
            # This is optional metadata, so a page without such a row/cell must not abort extraction.
            rows = re.findall(r'(?s)<TR\s*class="s14_24">(.+?)</TR>', webpage)
            cells = re.findall(r'<TD.+?</TD>', rows[-1]) if rows else []
            last_start = chapters[-1]['start_time']
            last_length = _parse_japanese_duration(clean_html(cells[-1])) if cells else None
            # Only set end_time when both the start and a non-zero length are known
            if last_start is not None and last_length:
                chapters[-1]['end_time'] = last_start + last_length

        return {
            'id': video_id,
            'title': title,
            'release_date': release_date,
            'chapters': chapters,
            'formats': formats,
            'subtitles': subtitles,
        }


class SangiinInstructionIE(InfoExtractor):
    """Matches the webtv.sangiin.go.jp index page only to tell the user which link to use instead."""

    _VALID_URL = r'^https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php'
    IE_DESC = False  # this shouldn't be listed as a supported site

    def _real_extract(self, url):
        """Always raise an expected ExtractorError carrying the instructions."""
        raise ExtractorError('Copy the link from the button below the video description or player, and use the link to download. If there is no button in the frame, get the URL of the frame showing the video.', expected=True)


class SangiinIE(InfoExtractor):
    """Extractor for webtv.sangiin.go.jp ``detail.php`` pages."""

    _VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/detail\.php\?sid=(?P<id>\d+)'
    IE_DESC = '参議院インターネット審議中継 (archive)'

    _TESTS = [{
        'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7052',
        'info_dict': {
            'id': '7052',
            'title': '2022年10月7日 本会議',
            'description': 'md5:0a5fed523f95c88105a0b0bf1dd71489',
            'upload_date': '20221007',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7037',
        'info_dict': {
            'id': '7037',
            'title': '2022年10月3日 開会式',
            'upload_date': '20221003',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7076',
        'info_dict': {
            'id': '7076',
            'title': '2022年10月27日 法務委員会',
            'upload_date': '20221027',
            'ext': 'mp4',
            'is_live': True,
        },
        'skip': 'this live is turned into archive after it ends',
    }, ]

    def _real_extract(self, url):
        """Extract formats and metadata from a detail page.

        The resulting title is the session date and the meeting name joined by
        a space; whichever of the two is missing is left out.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        date = self._html_search_regex(
            r'<dt[^>]*>\s*開会日\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
            'date', fatal=False)
        upload_date = _parse_japanese_date(date)

        # Label the lookup as 'title' so a failure is reported under the right name
        title = self._html_search_regex(
            r'<dt[^>]*>\s*会議名\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
            'title', fatal=False)

        # some videos don't have the elements, so assume it's missing
        description = self._html_search_regex(
            r'会議の経過\s*</h3>\s*<span[^>]*>(.+?)</span>', webpage,
            'description', default=None)

        # this row appears only when it's livestream
        is_live = bool(self._html_search_regex(
            r'<dt[^>]*>\s*公報掲載時刻\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
            'is_live', default=None))

        # The playlist URL is assigned to a JavaScript variable in the page
        m3u8_url = self._search_regex(
            r'var\s+videopath\s*=\s*(["\'])([^"\']+)\1', webpage,
            'm3u8 url', group=2)

        formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4')

        return {
            'id': video_id,
            'title': join_nonempty(date, title, delim=' '),
            'description': description,
            'upload_date': upload_date,
            'formats': formats,
            'subtitles': subs,
            'is_live': is_live,
        }
Commit	Line	Data
682b4524 L	1	import re
682b4524 L	2
e897bd82	3	from .common import InfoExtractor
682b4524 L	4	from ..utils import (
	5	ExtractorError,
	6	clean_html,
	7	int_or_none,
	8	join_nonempty,
	9	parse_qs,
	10	smuggle_url,
	11	traverse_obj,
	12	try_call,
e897bd82	13	unsmuggle_url,
682b4524	14	)
682b4524 L	15
	16
	17	def _parse_japanese_date(text):
	18	if not text:
	19	return None
	20	ERA_TABLE = {
	21	'明治': 1868,
	22	'大正': 1912,
	23	'昭和': 1926,
	24	'平成': 1989,
	25	'令和': 2019,
	26	}
	27	ERA_RE = '\|'.join(map(re.escape, ERA_TABLE.keys()))
	28	mobj = re.search(rf'({ERA_RE})?(\d+)年(\d+)月(\d+)日', re.sub(r'[\s\u3000]+', '', text))
	29	if not mobj:
	30	return None
	31	era, year, month, day = mobj.groups()
	32	year, month, day = map(int, (year, month, day))
	33	if era:
	34	# example input: 令和5年3月34日
	35	# even though each era have their end, don't check here
	36	year += ERA_TABLE[era]
	37	return '%04d%02d%02d' % (year, month, day)
	38
	39
	40	def _parse_japanese_duration(text):
	41	mobj = re.search(r'(?:(\d+)日間?)?(?:(\d+)時間?)?(?:(\d+)分)?(?:(\d+)秒)?', re.sub(r'[\s\u3000]+', '', text or ''))
	42	if not mobj:
	43	return
	44	days, hours, mins, secs = [int_or_none(x, default=0) for x in mobj.groups()]
	45	return secs + mins * 60 + hours * 60 * 60 + days * 24 * 60 * 60
	46
	47
	48	class ShugiinItvBaseIE(InfoExtractor):
	49	_INDEX_ROOMS = None
	50
	51	@classmethod
	52	def _find_rooms(cls, webpage):
	53	return [{
	54	'_type': 'url',
	55	'id': x.group(1),
	56	'title': clean_html(x.group(2)).strip(),
	57	'url': smuggle_url(f'https://www.shugiintv.go.jp/jp/index.php?room_id={x.group(1)}', {'g': x.groups()}),
	58	'ie_key': ShugiinItvLiveIE.ie_key(),
	59	} for x in re.finditer(r'(?s)<a\s+href="[^"]+\?room_id=(room\d+)"\s*class="play_live".+?class="s12_14">(.+?)</td>', webpage)]
	60
	61	def _fetch_rooms(self):
	62	if not self._INDEX_ROOMS:
	63	webpage = self._download_webpage(
	64	'https://www.shugiintv.go.jp/jp/index.php', None,
	65	encoding='euc-jp', note='Downloading proceedings info')
	66	ShugiinItvBaseIE._INDEX_ROOMS = self._find_rooms(webpage)
	67	return self._INDEX_ROOMS
	68
	69
	70	class ShugiinItvLiveIE(ShugiinItvBaseIE):
	71	_VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp\|en)(?:/index\.php)?$'
	72	IE_DESC = '衆議院インターネット審議中継'
	73
	74	_TESTS = [{
	75	'url': 'https://www.shugiintv.go.jp/jp/index.php',
	76	'info_dict': {
	77	'_type': 'playlist',
	78	'title': 'All proceedings for today',
79	},
80	# expect at least one proceedings is running
81	'playlist_mincount': 1,
82	}]
83
84	@classmethod
85	def suitable(cls, url):
86	return super().suitable(url) and not any(x.suitable(url) for x in (ShugiinItvLiveRoomIE, ShugiinItvVodIE))
87
88	def _real_extract(self, url):
89	self.to_screen(
90	'Downloading all running proceedings. To specify one proceeding, use direct link from the website')
91	return self.playlist_result(self._fetch_rooms(), playlist_title='All proceedings for today')
92
93
94	class ShugiinItvLiveRoomIE(ShugiinItvBaseIE):
95	_VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp\|en)/index\.php\?room_id=(?P<id>room\d+)'
96	IE_DESC = '衆議院インターネット審議中継 (中継)'
97
98	_TESTS = [{
99	'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room01',
100	'info_dict': {
101	'id': 'room01',
102	'title': '内閣委員会',
103	},
104	'skip': 'this runs for a time and not every day',
105	}, {
106	'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room11',
107	'info_dict': {
108	'id': 'room11',
109	'title': '外務委員会',
110	},
111	'skip': 'this runs for a time and not every day',
112	}]
113
114	def _real_extract(self, url):
115	url, smug = unsmuggle_url(url, default={})
116	if smug.get('g'):
117	room_id, title = smug['g']
118	else:
119	room_id = self._match_id(url)
120	title = traverse_obj(self._fetch_rooms(), (lambda k, v: v['id'] == room_id, 'title'), get_all=False)
121
122	formats, subtitles = self._extract_m3u8_formats_and_subtitles(
123	f'https://hlslive.shugiintv.go.jp/{room_id}/amlst:{room_id}/playlist.m3u8',
124	room_id, ext='mp4')
682b4524 L	125
	126	return {
	127	'id': room_id,
	128	'title': title,
	129	'formats': formats,
	130	'subtitles': subtitles,
	131	'is_live': True,
	132	}
	133
	134
	135	class ShugiinItvVodIE(ShugiinItvBaseIE):
	136	_VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp\|en)/index\.php\?ex=VL(?:\&[^=]+=[^&])\&deli_id=(?P<id>\d+)'
	137	IE_DESC = '衆議院インターネット審議中継 (ビデオライブラリ)'
	138	_TESTS = [{
	139	'url': 'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id=53846',
	140	'info_dict': {
	141	'id': '53846',
	142	'title': 'ウクライナ大統領国会演説（オンライン）',
	143	'release_date': '20220323',
	144	'chapters': 'count:4',
	145	}
	146	}, {
	147	'url': 'https://www.shugiintv.go.jp/en/index.php?ex=VL&media_type=&deli_id=53846',
	148	'only_matching': True
	149	}]
	150
	151	def _real_extract(self, url):
	152	video_id = self._match_id(url)
	153	webpage = self._download_webpage(
	154	f'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id={video_id}', video_id,
	155	encoding='euc-jp')
	156
	157	m3u8_url = self._search_regex(
	158	r'id="vtag_src_base_vod"\s*value="(http.+?\.m3u8)"', webpage, 'm3u8 url')
	159	m3u8_url = re.sub(r'^http://', 'https://', m3u8_url)
	160	formats, subtitles = self._extract_m3u8_formats_and_subtitles(
	161	m3u8_url, video_id, ext='mp4')
682b4524 L	162
	163	title = self._html_search_regex(
	164	(r'<td\s+align="left">(.+)\s*\(\d+分\)',
	165	r'<TD.+?<IMG\s*src=".+?/spacer\.gif".+?height="15">(.+?)<IMG'), webpage, 'title', fatal=False)
	166
	167	release_date = _parse_japanese_date(self._html_search_regex(
	168	r'開会日</td>\s<td.+?/td>\s<TD>(.+?)</TD>',
	169	webpage, 'title', fatal=False))
	170
	171	chapters = []
	172	for chp in re.finditer(r'(?i)<A\s+HREF="([^"]+?)"\s*class="play_vod">(?!<img)(.+)</[Aa]>', webpage):
	173	chapters.append({
	174	'title': clean_html(chp.group(2)).strip(),
	175	'start_time': try_call(lambda: float(parse_qs(chp.group(1))['time'][0].strip())),
	176	})
	177	# NOTE: there are blanks at the first and the end of the videos,
	178	# so getting/providing the video duration is not possible
	179	# also, the exact end_time for the last chapter is unknown (we can get at most minutes of granularity)
	180	last_tr = re.findall(r'(?s)<TR\s*class="s14_24">(.+?)</TR>', webpage)[-1]
	181	if last_tr and chapters:
	182	last_td = re.findall(r'<TD.+?</TD>', last_tr)[-1]
	183	if last_td:
	184	chapters[-1]['end_time'] = chapters[-1]['start_time'] + _parse_japanese_duration(clean_html(last_td))
	185
	186	return {
	187	'id': video_id,
	188	'title': title,
	189	'release_date': release_date,
	190	'chapters': chapters,
	191	'formats': formats,
	192	'subtitles': subtitles,
	193	}
	194
	195
	196	class SangiinInstructionIE(InfoExtractor):
	197	_VALID_URL = r'^https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php'
	198	IE_DESC = False # this shouldn't be listed as a supported site
	199
	200	def _real_extract(self, url):
	201	raise ExtractorError('Copy the link from the botton below the video description or player, and use the link to download. If there are no button in the frame, get the URL of the frame showing the video.', expected=True)
	202
	203
	204	class SangiinIE(InfoExtractor):
	205	_VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/detail\.php\?sid=(?P<id>\d+)'
	206	IE_DESC = '参議院インターネット審議中継 (archive)'
	207
	208	_TESTS = [{
	209	'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7052',
	210	'info_dict': {
	211	'id': '7052',
	212	'title': '2022年10月7日本会議',
	213	'description': 'md5:0a5fed523f95c88105a0b0bf1dd71489',
	214	'upload_date': '20221007',
	215	'ext': 'mp4',
	216	},
	217	}, {
	218	'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7037',
	219	'info_dict': {
	220	'id': '7037',
	221	'title': '2022年10月3日開会式',
	222	'upload_date': '20221003',
	223	'ext': 'mp4',
	224	},
	225	}, {
226	'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7076',
227	'info_dict': {
228	'id': '7076',
229	'title': '2022年10月27日法務委員会',
230	'upload_date': '20221027',
231	'ext': 'mp4',
232	'is_live': True,
233	},
234	'skip': 'this live is turned into archive after it ends',
235	}, ]
236
237	def _real_extract(self, url):
238	video_id = self._match_id(url)
239	webpage = self._download_webpage(url, video_id)
240
241	date = self._html_search_regex(
242	r'<dt[^>]>\s開会日\s</dt>\s<dd[^>]>\s(.+?)\s*</dd>', webpage,
243	'date', fatal=False)
244	upload_date = _parse_japanese_date(date)
245
246	title = self._html_search_regex(
247	r'<dt[^>]>\s会議名\s</dt>\s<dd[^>]>\s(.+?)\s*</dd>', webpage,
248	'date', fatal=False)
249
250	# some videos don't have the elements, so assume it's missing
251	description = self._html_search_regex(
252	r'会議の経過\s</h3>\s<span[^>]*>(.+?)</span>', webpage,
253	'description', default=None)
254
255	# this row appears only when it's livestream
256	is_live = bool(self._html_search_regex(
257	r'<dt[^>]>\s公報掲載時刻\s</dt>\s<dd[^>]>\s(.+?)\s*</dd>', webpage,
258	'is_live', default=None))
259
260	m3u8_url = self._search_regex(
261	r'var\s+videopath\s=\s(["\'])([^"\']+)\1', webpage,
262	'm3u8 url', group=2)
263
264	formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4')
682b4524 L	265
	266	return {
	267	'id': video_id,
	268	'title': join_nonempty(date, title, delim=' '),
	269	'description': description,
	270	'upload_date': upload_date,
	271	'formats': formats,
	272	'subtitles': subs,
	273	'is_live': is_live,
	274	}