]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/japandiet.py
[misc] Add `hatch`, `ruff`, `pre-commit` and improve dev docs (#7409)
[yt-dlp.git] / yt_dlp / extractor / japandiet.py
CommitLineData
682b4524
L
1import re
2
e897bd82 3from .common import InfoExtractor
682b4524
L
4from ..utils import (
5 ExtractorError,
6 clean_html,
7 int_or_none,
8 join_nonempty,
9 parse_qs,
10 smuggle_url,
11 traverse_obj,
12 try_call,
e897bd82 13 unsmuggle_url,
682b4524 14)
682b4524
L
15
16
17def _parse_japanese_date(text):
18 if not text:
19 return None
20 ERA_TABLE = {
21 '明治': 1868,
22 '大正': 1912,
23 '昭和': 1926,
24 '平成': 1989,
25 '令和': 2019,
26 }
27 ERA_RE = '|'.join(map(re.escape, ERA_TABLE.keys()))
28 mobj = re.search(rf'({ERA_RE})?(\d+)年(\d+)月(\d+)日', re.sub(r'[\s\u3000]+', '', text))
29 if not mobj:
30 return None
31 era, year, month, day = mobj.groups()
32 year, month, day = map(int, (year, month, day))
33 if era:
34 # example input: 令和5年3月34日
35 # even though each era have their end, don't check here
36 year += ERA_TABLE[era]
37 return '%04d%02d%02d' % (year, month, day)
38
39
40def _parse_japanese_duration(text):
41 mobj = re.search(r'(?:(\d+)日間?)?(?:(\d+)時間?)?(?:(\d+)分)?(?:(\d+)秒)?', re.sub(r'[\s\u3000]+', '', text or ''))
42 if not mobj:
43 return
44 days, hours, mins, secs = [int_or_none(x, default=0) for x in mobj.groups()]
45 return secs + mins * 60 + hours * 60 * 60 + days * 24 * 60 * 60
46
47
class ShugiinItvBaseIE(InfoExtractor):
    """Shared helpers for the shugiintv.go.jp extractors."""

    # Cache of the room entries parsed from the index page. It is stored on
    # this base class (see _fetch_rooms) so every subclass shares one copy.
    _INDEX_ROOMS = None

    @classmethod
    def _find_rooms(cls, webpage):
        """Build one ``url`` result per live-room link found in *webpage*.

        Each entry's URL is smuggled with ``{'g': (room_id, raw_title)}`` so
        the room extractor can skip re-downloading the index page.
        """
        return [{
            '_type': 'url',
            'id': x.group(1),
            'title': clean_html(x.group(2)).strip(),
            'url': smuggle_url(f'https://www.shugiintv.go.jp/jp/index.php?room_id={x.group(1)}', {'g': x.groups()}),
            # The entry URL carries a room_id and smuggled data, both of which
            # only ShugiinItvLiveRoomIE handles; ShugiinItvLiveIE explicitly
            # refuses room URLs in its suitable().
            'ie_key': ShugiinItvLiveRoomIE.ie_key(),
        } for x in re.finditer(r'(?s)<a\s+href="[^"]+\?room_id=(room\d+)"\s*class="play_live".+?class="s12_14">(.+?)</td>', webpage)]

    def _fetch_rooms(self):
        """Return the room entries from the index page, downloading it if not cached.

        An empty result is not treated as cached, so the page is fetched again
        on the next call.
        """
        if not self._INDEX_ROOMS:
            webpage = self._download_webpage(
                'https://www.shugiintv.go.jp/jp/index.php', None,
                encoding='euc-jp', note='Downloading proceedings info')
            ShugiinItvBaseIE._INDEX_ROOMS = self._find_rooms(webpage)
        return self._INDEX_ROOMS
68
69
class ShugiinItvLiveIE(ShugiinItvBaseIE):
    """Playlist extractor listing every proceeding currently shown on the index page."""

    _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)(?:/index\.php)?$'
    IE_DESC = '衆議院インターネット審議中継'

    _TESTS = [{
        'url': 'https://www.shugiintv.go.jp/jp/index.php',
        'info_dict': {
            '_type': 'playlist',
            'title': 'All proceedings for today',
        },
        # only passes while at least one proceeding is being broadcast
        'playlist_mincount': 1,
    }]

    @classmethod
    def suitable(cls, url):
        """Accept *url* only if it matches here and neither the room nor the VOD extractor claims it."""
        matched = super().suitable(url)
        if not matched:
            return matched
        for more_specific_ie in (ShugiinItvLiveRoomIE, ShugiinItvVodIE):
            if more_specific_ie.suitable(url):
                return False
        return True

    def _real_extract(self, url):
        """Return a playlist made of the room entries parsed from the index page."""
        hint = 'Downloading all running proceedings. To specify one proceeding, use direct link from the website'
        self.to_screen(hint)
        rooms = self._fetch_rooms()
        return self.playlist_result(rooms, playlist_title='All proceedings for today')
92
93
class ShugiinItvLiveRoomIE(ShugiinItvBaseIE):
    """Extractor for the live stream of a single room (``?room_id=roomNN``)."""

    _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?room_id=(?P<id>room\d+)'
    IE_DESC = '衆議院インターネット審議中継 (中継)'

    _TESTS = [{
        'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room01',
        'info_dict': {
            'id': 'room01',
            'title': '内閣委員会',
        },
        'skip': 'this runs for a time and not every day',
    }, {
        'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room11',
        'info_dict': {
            'id': 'room11',
            'title': '外務委員会',
        },
        'skip': 'this runs for a time and not every day',
    }]

    def _real_extract(self, url):
        """Resolve the room id and title, then build the live HLS formats.

        The id and title come from smuggled data when the URL was produced by
        ``_find_rooms``; otherwise the id is taken from the URL and the title
        is looked up in the index page's room list.
        """
        url, smug = unsmuggle_url(url, default={})
        if smug.get('g'):
            room_id, raw_title = smug['g']
            # The smuggled title is the raw regex capture from the index page;
            # normalise it the same way the playlist entry title is built.
            title = clean_html(raw_title)
            if title:
                title = title.strip()
        else:
            room_id = self._match_id(url)
            title = traverse_obj(self._fetch_rooms(), (lambda k, v: v['id'] == room_id, 'title'), get_all=False)

        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            f'https://hlslive.shugiintv.go.jp/{room_id}/amlst:{room_id}/playlist.m3u8',
            room_id, ext='mp4')

        return {
            'id': room_id,
            'title': title,
            'formats': formats,
            'subtitles': subtitles,
            'is_live': True,
        }
133
134
class ShugiinItvVodIE(ShugiinItvBaseIE):
    """Extractor for video library entries (``?ex=VL&...&deli_id=NNN``)."""

    _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?ex=VL(?:\&[^=]+=[^&]*)*\&deli_id=(?P<id>\d+)'
    IE_DESC = '衆議院インターネット審議中継 (ビデオライブラリ)'
    _TESTS = [{
        'url': 'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id=53846',
        'info_dict': {
            'id': '53846',
            'title': 'ウクライナ大統領国会演説(オンライン)',
            'release_date': '20220323',
            'chapters': 'count:4',
        },
    }, {
        'url': 'https://www.shugiintv.go.jp/en/index.php?ex=VL&media_type=&deli_id=53846',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Download the Japanese VOD page and extract formats, title, date and chapters."""
        video_id = self._match_id(url)
        # Always fetch the /jp/ page, whichever language the input URL used
        webpage = self._download_webpage(
            f'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id={video_id}', video_id,
            encoding='euc-jp')

        m3u8_url = self._search_regex(
            r'id="vtag_src_base_vod"\s*value="(http.+?\.m3u8)"', webpage, 'm3u8 url')
        # Upgrade plain-http playlist URLs to https
        m3u8_url = re.sub(r'^http://', 'https://', m3u8_url)
        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
            m3u8_url, video_id, ext='mp4')

        title = self._html_search_regex(
            (r'<td\s+align="left">(.+)\s*\(\d+分\)',
             r'<TD.+?<IMG\s*src=".+?/spacer\.gif".+?height="15">(.+?)<IMG'), webpage, 'title', fatal=False)

        release_date = _parse_japanese_date(self._html_search_regex(
            r'開会日</td>\s*<td.+?/td>\s*<TD>(.+?)</TD>',
            webpage, 'release date', fatal=False))

        chapters = []
        for chp in re.finditer(r'(?i)<A\s+HREF="([^"]+?)"\s*class="play_vod">(?!<img)(.+)</[Aa]>', webpage):
            chapters.append({
                'title': clean_html(chp.group(2)).strip(),
                # None when the link has no usable "time" query parameter
                'start_time': try_call(lambda: float(parse_qs(chp.group(1))['time'][0].strip())),
            })
        # NOTE: there are blanks at the first and the end of the videos,
        # so getting/providing the video duration is not possible
        # also, the exact end_time for the last chapter is unknown (we can get at most minutes of granularity)
        rows = re.findall(r'(?s)<TR\s*class="s14_24">(.+?)</TR>', webpage)
        if rows and chapters:
            cells = re.findall(r'<TD.+?</TD>', rows[-1])
            last_start = chapters[-1]['start_time']
            last_duration = _parse_japanese_duration(clean_html(cells[-1])) if cells else None
            # Skip end_time rather than fail when the page layout differs or
            # the last chapter has no start time
            if last_start is not None and last_duration is not None:
                chapters[-1]['end_time'] = last_start + last_duration

        return {
            'id': video_id,
            'title': title,
            'release_date': release_date,
            'chapters': chapters,
            'formats': formats,
            'subtitles': subtitles,
        }
194
195
class SangiinInstructionIE(InfoExtractor):
    """Matches the webtv.sangiin.go.jp index page only to tell the user which URL to use instead."""

    _VALID_URL = r'^https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php'
    IE_DESC = False  # this shouldn't be listed as a supported site

    def _real_extract(self, url):
        """Always raise an expected ExtractorError carrying usage instructions."""
        raise ExtractorError(
            'Copy the link from the button below the video description or player, and use the link to download. '
            'If there is no button in the frame, get the URL of the frame showing the video.', expected=True)
202
203
class SangiinIE(InfoExtractor):
    """Extractor for webtv.sangiin.go.jp detail pages (``detail.php?sid=NNN``)."""

    _VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/detail\.php\?sid=(?P<id>\d+)'
    IE_DESC = '参議院インターネット審議中継 (archive)'

    _TESTS = [{
        'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7052',
        'info_dict': {
            'id': '7052',
            'title': '2022年10月7日 本会議',
            'description': 'md5:0a5fed523f95c88105a0b0bf1dd71489',
            'upload_date': '20221007',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7037',
        'info_dict': {
            'id': '7037',
            'title': '2022年10月3日 開会式',
            'upload_date': '20221003',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7076',
        'info_dict': {
            'id': '7076',
            'title': '2022年10月27日 法務委員会',
            'upload_date': '20221027',
            'ext': 'mp4',
            'is_live': True,
        },
        'skip': 'this live is turned into archive after it ends',
    }]

    def _real_extract(self, url):
        """Download the detail page and extract metadata plus HLS formats.

        The returned title is the displayed date and the meeting name joined
        by a space; whichever of the two is missing is omitted.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        date = self._html_search_regex(
            r'<dt[^>]*>\s*開会日\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
            'date', fatal=False)
        upload_date = _parse_japanese_date(date)

        title = self._html_search_regex(
            r'<dt[^>]*>\s*会議名\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
            'title', fatal=False)

        # some videos don't have the elements, so assume it's missing
        description = self._html_search_regex(
            r'会議の経過\s*</h3>\s*<span[^>]*>(.+?)</span>', webpage,
            'description', default=None)

        # this row appears only when it's livestream
        is_live = bool(self._html_search_regex(
            r'<dt[^>]*>\s*公報掲載時刻\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
            'is_live', default=None))

        # group 1 is the quote character, group 2 the playlist URL
        m3u8_url = self._search_regex(
            r'var\s+videopath\s*=\s*(["\'])([^"\']+)\1', webpage,
            'm3u8 url', group=2)

        formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4')

        return {
            'id': video_id,
            'title': join_nonempty(date, title, delim=' '),
            'description': description,
            'upload_date': upload_date,
            'formats': formats,
            'subtitles': subs,
            'is_live': is_live,
        }