]>
jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/japandiet.py
14 from . common
import InfoExtractor
17 def _parse_japanese_date ( text
):
27 ERA_RE
= '|' . join ( map ( re
. escape
, ERA_TABLE
. keys ()))
28 mobj
= re
. search ( rf
'( {ERA_RE} )?(\d+)年(\d+)月(\d+)日' , re
. sub ( r
'[\s\u3000]+' , '' , text
))
31 era
, year
, month
, day
= mobj
. groups ()
32 year
, month
, day
= map ( int , ( year
, month
, day
))
34 # example input: 令和5年3月34日
35 # even though each era have their end, don't check here
36 year
+= ERA_TABLE
[ era
]
37 return ' %0 4d %0 2d %0 2d' % ( year
, month
, day
)
40 def _parse_japanese_duration ( text
):
41 mobj
= re
. search ( r
'(?:(\d+)日間?)?(?:(\d+)時間?)?(?:(\d+)分)?(?:(\d+)秒)?' , re
. sub ( r
'[\s\u3000]+' , '' , text
or '' ))
44 days
, hours
, mins
, secs
= [ int_or_none ( x
, default
= 0 ) for x
in mobj
. groups ()]
45 return secs
+ mins
* 60 + hours
* 60 * 60 + days
* 24 * 60 * 60
48 class ShugiinItvBaseIE ( InfoExtractor
):
52 def _find_rooms ( cls
, webpage
):
56 'title' : clean_html ( x
. group ( 2 )). strip (),
57 'url' : smuggle_url ( f
'https://www.shugiintv.go.jp/jp/index.php?room_id= {x.group(1)} ' , {'g': x.groups()}
),
58 'ie_key' : ShugiinItvLiveIE
. ie_key (),
59 } for x
in re
. finditer ( r
'(?s)<a\s+href="[^"]+\?room_id=(room\d+)"\s*class="play_live".+?class="s12_14">(.+?)</td>' , webpage
)]
def _fetch_rooms(self):
    """Return the room entries, downloading the index page when needed.

    While ``_INDEX_ROOMS`` is truthy it is returned as-is. Otherwise the
    index page is downloaded (decoded as EUC-JP), handed to
    ``_find_rooms`` and the result is stored on ``ShugiinItvBaseIE`` before
    being returned.
    """
    if self._INDEX_ROOMS:
        return self._INDEX_ROOMS

    index_page = self._download_webpage(
        'https://www.shugiintv.go.jp/jp/index.php', None,
        note='Downloading proceedings info', encoding='euc-jp')
    # Stored on the base class (not on the instance or subclass).
    ShugiinItvBaseIE._INDEX_ROOMS = self._find_rooms(index_page)
    return self._INDEX_ROOMS
70 class ShugiinItvLiveIE ( ShugiinItvBaseIE
):
71 _VALID_URL
= r
'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)(?:/index\.php)?$'
72 IE_DESC
= '衆議院インターネット審議中継'
75 'url' : 'https://www.shugiintv.go.jp/jp/index.php' ,
78 'title' : 'All proceedings for today' ,
80 # expect at least one proceedings is running
81 'playlist_mincount' : 1 ,
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    The parent implementation must accept the URL, and neither
    ``ShugiinItvLiveRoomIE`` nor ``ShugiinItvVodIE`` may accept it.
    """
    accepted = super().suitable(url)
    if not accepted:
        return accepted
    more_specific = (ShugiinItvLiveRoomIE, ShugiinItvVodIE)
    return not any(ie.suitable(url) for ie in more_specific)
88 def _real_extract ( self
, url
):
90 'Downloading all running proceedings. To specify one proceeding, use direct link from the website' )
91 return self
. playlist_result ( self
._ fetch
_ rooms
(), playlist_title
= 'All proceedings for today' )
94 class ShugiinItvLiveRoomIE ( ShugiinItvBaseIE
):
95 _VALID_URL
= r
'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?room_id=(?P<id>room\d+)'
96 IE_DESC
= '衆議院インターネット審議中継 (中継)'
99 'url' : 'https://www.shugiintv.go.jp/jp/index.php?room_id=room01' ,
104 'skip' : 'this runs for a time and not every day' ,
106 'url' : 'https://www.shugiintv.go.jp/jp/index.php?room_id=room11' ,
111 'skip' : 'this runs for a time and not every day' ,
114 def _real_extract ( self
, url
):
115 url
, smug
= unsmuggle_url ( url
, default
={})
117 room_id
, title
= smug
[ 'g' ]
119 room_id
= self
._ match
_ id
( url
)
120 title
= traverse_obj ( self
._ fetch
_ rooms
(), ( lambda k
, v
: v
[ 'id' ] == room_id
, 'title' ), get_all
= False )
122 formats
, subtitles
= self
._ extract
_ m
3u8_ formats
_ and
_ subtitles
(
123 f
'https://hlslive.shugiintv.go.jp/ {room_id} /amlst: {room_id} /playlist.m3u8' ,
130 'subtitles' : subtitles
,
135 class ShugiinItvVodIE ( ShugiinItvBaseIE
):
136 _VALID_URL
= r
'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?ex=VL(?:\&[^=]+=[^&]*)*\&deli_id=(?P<id>\d+)'
137 IE_DESC
= '衆議院インターネット審議中継 (ビデオライブラリ)'
139 'url' : 'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id=53846' ,
142 'title' : 'ウクライナ大統領国会演説(オンライン)' ,
143 'release_date' : '20220323' ,
144 'chapters' : 'count:4' ,
147 'url' : 'https://www.shugiintv.go.jp/en/index.php?ex=VL&media_type=&deli_id=53846' ,
148 'only_matching' : True
151 def _real_extract ( self
, url
):
152 video_id
= self
._ match
_ id
( url
)
153 webpage
= self
._ download
_ webpage
(
154 f
'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id= {video_id} ' , video_id
,
157 m3u8_url
= self
._ search
_ regex
(
158 r
'id="vtag_src_base_vod"\s*value="(http.+?\.m3u8)"' , webpage
, 'm3u8 url' )
159 m3u8_url
= re
. sub ( r
'^http://' , 'https://' , m3u8_url
)
160 formats
, subtitles
= self
._ extract
_ m
3u8_ formats
_ and
_ subtitles
(
161 m3u8_url
, video_id
, ext
= 'mp4' )
163 title
= self
._ html
_ search
_ regex
(
164 ( r
'<td\s+align="left">(.+)\s*\(\d+分\)' ,
165 r
'<TD.+?<IMG\s*src=".+?/spacer\.gif".+?height="15">(.+?)<IMG' ), webpage
, 'title' , fatal
= False )
167 release_date
= _parse_japanese_date ( self
._ html
_ search
_ regex
(
168 r
'開会日</td>\s*<td.+?/td>\s*<TD>(.+?)</TD>' ,
169 webpage
, 'title' , fatal
= False ))
172 for chp
in re
. finditer ( r
'(?i)<A\s+HREF="([^"]+?)"\s*class="play_vod">(?!<img)(.+)</[Aa]>' , webpage
):
174 'title' : clean_html ( chp
. group ( 2 )). strip (),
175 'start_time' : try_call ( lambda : float ( parse_qs ( chp
. group ( 1 ))[ 'time' ][ 0 ]. strip ())),
177 # NOTE: there are blanks at the first and the end of the videos,
178 # so getting/providing the video duration is not possible
179 # also, the exact end_time for the last chapter is unknown (we can get at most minutes of granularity)
180 last_tr
= re
. findall ( r
'(?s)<TR\s*class="s14_24">(.+?)</TR>' , webpage
)[- 1 ]
181 if last_tr
and chapters
:
182 last_td
= re
. findall ( r
'<TD.+?</TD>' , last_tr
)[- 1 ]
184 chapters
[- 1 ][ 'end_time' ] = chapters
[- 1 ][ 'start_time' ] + _parse_japanese_duration ( clean_html ( last_td
))
189 'release_date' : release_date
,
190 'chapters' : chapters
,
192 'subtitles' : subtitles
,
class SangiinInstructionIE(InfoExtractor):
    """Placeholder extractor for the Sangiin WebTV ``index.php`` page.

    It never extracts anything: every matching URL raises an expected
    ``ExtractorError`` that tells the user which link to pass instead.
    """
    _VALID_URL = r'^https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php'
    IE_DESC = False  # this shouldn't be listed as a supported site

    def _real_extract(self, url):
        # expected=True marks this as a user-facing message, not a bug.
        raise ExtractorError(
            'Copy the link from the button below the video description or player, and use the link to download. '
            'If there is no button in the frame, get the URL of the frame showing the video.', expected=True)
204 class SangiinIE ( InfoExtractor
):
205 _VALID_URL
= r
'https?://www\.webtv\.sangiin\.go\.jp/webtv/detail\.php\?sid=(?P<id>\d+)'
206 IE_DESC
= '参議院インターネット審議中継 (archive)'
209 'url' : 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7052' ,
212 'title' : '2022年10月7日 本会議' ,
213 'description' : 'md5:0a5fed523f95c88105a0b0bf1dd71489' ,
214 'upload_date' : '20221007' ,
218 'url' : 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7037' ,
221 'title' : '2022年10月3日 開会式' ,
222 'upload_date' : '20221003' ,
226 'url' : 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7076' ,
229 'title' : '2022年10月27日 法務委員会' ,
230 'upload_date' : '20221027' ,
234 'skip' : 'this live is turned into archive after it ends' ,
237 def _real_extract ( self
, url
):
238 video_id
= self
._ match
_ id
( url
)
239 webpage
= self
._ download
_ webpage
( url
, video_id
)
241 date
= self
._ html
_ search
_ regex
(
242 r
'<dt[^>]*>\s*開会日\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>' , webpage
,
244 upload_date
= _parse_japanese_date ( date
)
246 title
= self
._ html
_ search
_ regex
(
247 r
'<dt[^>]*>\s*会議名\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>' , webpage
,
250 # some videos don't have the elements, so assume it's missing
251 description
= self
._ html
_ search
_ regex
(
252 r
'会議の経過\s*</h3>\s*<span[^>]*>(.+?)</span>' , webpage
,
253 'description' , default
= None )
255 # this row appears only when it's livestream
256 is_live
= bool ( self
._ html
_ search
_ regex
(
257 r
'<dt[^>]*>\s*公報掲載時刻\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>' , webpage
,
258 'is_live' , default
= None ))
260 m3u8_url
= self
._ search
_ regex
(
261 r
'var\s+videopath\s*=\s*(["\' ])([ ^
" \' ]+)\1', webpage,
264 formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4')
268 'title': join_nonempty(date, title, delim=' '),
269 'description': description,
270 'upload_date': upload_date,