]>
jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/washingtonpost.py
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import traverse_obj
class WashingtonPostIE(InfoExtractor):
    """Extractor for a single Washington Post video identified by its UUID.

    Matches either the internal ``washingtonpost:<uuid>`` form or a
    ``/video/`` / ``/posttv/`` page URL whose last path component starts
    with the UUID, and forwards that UUID as ``arcpublishing:wapo:<uuid>``.
    """

    # NOTE(review): this class was recovered from a whitespace-mangled copy in
    # which whole lines were dropped.  The ``_TESTS``/``info_dict`` wrappers,
    # the ``@classmethod`` decorator and the ``re.findall`` call below were
    # restored from the surrounding tokens and yt-dlp extractor conventions;
    # verify them against the upstream file.
    IE_NAME = 'washingtonpost'

    # The ``{n}`` quantifiers must sit directly after the character class:
    # ``[\da-f] {8}`` would mean "one hex digit, then eight spaces" and could
    # never match the UUIDs used in the test URLs below.
    _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/(?:video|posttv)/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
    _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'

    _TESTS = [{
        'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
        'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
        'info_dict': {
            'id': '480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
            # NOTE(review): the source's own numbering skips one entry
            # between 'id' and 'title'; it was lost and is not guessed here.
            'title': 'Egypt finds belongings, debris from plane crash',
            'description': 'md5:a17ceee432f215a5371388c1f680bd86',
            'upload_date': '20160520',
            'timestamp': 1463775187,
        },
    }, {
        'url': 'https://www.washingtonpost.com/video/world/egypt-finds-belongings-debris-from-plane-crash/2016/05/20/480ba4ee-1ec7-11e6-82c2-a7dcb313287d_video.html',
        'only_matching': True,
    }, {
        'url': 'https://www.washingtonpost.com/posttv/world/iraq-to-track-down-antiquities-after-islamic-state-museum-rampage/2015/02/28/7c57e916-bf86-11e4-9dfb-03366e719af8_video.html',
        'only_matching': True,
    }]

    @classmethod
    def _extract_urls(cls, webpage):
        """Return every embed URL found in an ``<iframe src=...>`` of *webpage*.

        The pattern captures one group (the URL matching ``_EMBED_URL``), so
        ``re.findall`` yields a list of URL strings; it is empty when the
        page has no matching iframe.
        """
        return re.findall(
            r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage)

    def _real_extract(self, url):
        """Delegate extraction of the video behind *url*.

        The UUID is taken from the ``id`` group of ``_VALID_URL`` and passed
        on via ``url_result`` as ``arcpublishing:wapo:<uuid>`` together with
        the ``'ArcPublishing'`` name and the UUID itself.
        """
        video_id = self._match_id(url)
        return self.url_result(
            'arcpublishing:wapo:' + video_id, 'ArcPublishing', video_id)
45 class WashingtonPostArticleIE(InfoExtractor):
46 IE_NAME = ' washingtonpost
: article
'
47 _VALID_URL = r' https?
://( ?
: www\
.) ?washingtonpost\
. com
/( ?
:[ ^
/]+/)*( ?P
< id >[ ^
/ ?
#]+)'
49 'url' : 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/' ,
51 'id' : 'sinkhole-of-bureaucracy' ,
52 'title' : 'Sinkhole of bureaucracy' ,
55 'md5' : '7ccf53ea8cbb77de5f570242b3b21a59' ,
57 'id' : 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f' ,
59 'title' : 'Breaking Points: The Paper Mine' ,
61 'description' : 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.' ,
62 'timestamp' : 1395440416 ,
63 'upload_date' : '20140321' ,
64 'thumbnail' : r
're:https://[^\.]+.cloudfront\.net/PAPERMINESplash\.jpg' ,
67 'md5' : '7ccf53ea8cbb77de5f570242b3b21a59' ,
69 'id' : '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f' ,
71 'title' : 'The town bureaucracy sustains' ,
72 'description' : 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it \' s like to do paperwork 230 feet underground.' ,
74 'timestamp' : 1395441819 ,
75 'upload_date' : '20140321' ,
76 'thumbnail' : r
're:https://[^\.]+.cloudfront\.net/BoyersSplash\.jpeg' ,
80 'url' : 'http://www.washingtonpost.com/blogs/wonkblog/wp/2014/12/31/one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear/' ,
82 'id' : 'one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear' ,
83 'title' : 'One airline figured out how to make sure its airplanes never disappear' ,
86 'md5' : 'a7c1b5634ba5e57a6a82cdffa5b1e0d0' ,
88 'id' : '0e4bb54c-9065-11e4-a66f-0ca5037a597d' ,
90 'description' : 'Washington Post transportation reporter Ashley Halsey III explains why a plane \' s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.' ,
91 'upload_date' : '20141230' ,
92 'timestamp' : 1419972442 ,
93 'title' : 'Why black boxes don’t transmit data in real time' ,
96 'skip' : 'Doesnt have a video anymore' ,
98 'url' : 'https://www.washingtonpost.com/nation/2021/08/05/dixie-river-fire-california-climate/' ,
99 'only_matching' : True ,
@classmethod
def suitable(cls, url):
    """Report whether this article extractor should handle *url*.

    Returns ``False`` for any URL that ``WashingtonPostIE`` accepts;
    otherwise defers to the parent class's ``suitable`` check.
    """
    # NOTE(review): the decorator line was missing from the mangled source;
    # ``cls`` and the two-argument ``super(..., cls)`` call require a
    # classmethod, so it is restored here -- confirm against upstream.
    if WashingtonPostIE.suitable(url):
        # This class's URL pattern is broad enough to also match
        # single-video pages, so those are explicitly left to the
        # dedicated extractor.
        return False
    return super(WashingtonPostArticleIE, cls).suitable(url)
106 def _real_extract ( self
, url
):
107 page_id
= self
._ match
_ id
( url
)
108 webpage
= self
._ download
_ webpage
( url
, page_id
)
110 title
= self
._ og
_ search
_ title
( webpage
)
112 uuids
= re
. findall ( r
'''(?x)
114 <div\s+class="posttv-video-embed[^>]*?data-uuid=|
116 )"([^"]+)"''' , webpage
)
119 json_data
= self
._ search
_ nextjs
_ data
( webpage
, page_id
)
120 for content_element
in traverse_obj ( json_data
, ( 'props' , 'pageProps' , 'globalContent' , 'content_elements' )):
121 if content_element
. get ( 'type' ) == 'video' :
122 uuids
. append ( content_element
. get ( '_id' ))
124 entries
= [ self
. url_result ( 'washingtonpost: %s ' % uuid
, 'WashingtonPost' , uuid
) for uuid
in uuids
]