]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/washingtonpost.py
[ie/crunchyroll] Fix stream extraction (#10005)
[yt-dlp.git] / yt_dlp / extractor / washingtonpost.py
CommitLineData
fac55558
PH
1import re
2
3from .common import InfoExtractor
fac55558 4
5b804e39
B
5from ..utils import traverse_obj
6
fac55558
PH
7
class WashingtonPostIE(InfoExtractor):
    """Extractor for a single Washington Post video identified by its UUID.

    Matches both the internal ``washingtonpost:<uuid>`` scheme and public
    ``/video/`` or ``/posttv/`` page URLs, then hands the UUID over to the
    ``ArcPublishing`` extractor under the ``wapo`` organisation prefix.
    """

    IE_NAME = 'washingtonpost'
    # The ``id`` group captures the video UUID (8-4-4-4-12 lowercase hex).
    _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/(?:video|posttv)/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
    # Matches iframe embeds that point at the ``/video/c/embed/<uuid>`` player.
    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})']
    _TESTS = [
        {
            'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
            'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
            'info_dict': {
                'id': '480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
                'ext': 'mp4',
                'title': 'Egypt finds belongings, debris from plane crash',
                'description': 'md5:a17ceee432f215a5371388c1f680bd86',
                'upload_date': '20160520',
                'timestamp': 1463775187,
            },
        },
        {
            'url': 'https://www.washingtonpost.com/video/world/egypt-finds-belongings-debris-from-plane-crash/2016/05/20/480ba4ee-1ec7-11e6-82c2-a7dcb313287d_video.html',
            'only_matching': True,
        },
        {
            'url': 'https://www.washingtonpost.com/posttv/world/iraq-to-track-down-antiquities-after-islamic-state-museum-rampage/2015/02/28/7c57e916-bf86-11e4-9dfb-03366e719af8_video.html',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Return a URL result delegating the matched UUID to ArcPublishing."""
        uuid = self._match_id(url)
        arc_url = f'arcpublishing:wapo:{uuid}'
        return self.url_result(arc_url, 'ArcPublishing', uuid)
4b464a6a 35
36
class WashingtonPostArticleIE(InfoExtractor):
    """Extractor for Washington Post article pages that embed videos.

    The article is returned as a playlist whose entries are
    ``washingtonpost:<uuid>`` URL results handled by ``WashingtonPostIE``.
    """

    IE_NAME = 'washingtonpost:article'
    # The ``id`` group captures the last non-empty path segment of the URL.
    _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',
        'info_dict': {
            'id': 'sinkhole-of-bureaucracy',
            'title': 'Sinkhole of bureaucracy',
        },
        'playlist': [{
            'md5': '7ccf53ea8cbb77de5f570242b3b21a59',
            'info_dict': {
                'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f',
                'ext': 'mp4',
                'title': 'Breaking Points: The Paper Mine',
                'duration': 1290,
                'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.',
                'timestamp': 1395440416,
                'upload_date': '20140321',
                'thumbnail': r're:https://[^\.]+.cloudfront\.net/PAPERMINESplash\.jpg',
            },
        }, {
            'md5': '7ccf53ea8cbb77de5f570242b3b21a59',
            'info_dict': {
                'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f',
                'ext': 'mp4',
                'title': 'The town bureaucracy sustains',
                'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.',
                'duration': 2220,
                'timestamp': 1395441819,
                'upload_date': '20140321',
                'thumbnail': r're:https://[^\.]+.cloudfront\.net/BoyersSplash\.jpeg',
            },
        }],
    }, {
        'url': 'http://www.washingtonpost.com/blogs/wonkblog/wp/2014/12/31/one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear/',
        'info_dict': {
            'id': 'one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear',
            'title': 'One airline figured out how to make sure its airplanes never disappear',
        },
        'playlist': [{
            'md5': 'a7c1b5634ba5e57a6a82cdffa5b1e0d0',
            'info_dict': {
                'id': '0e4bb54c-9065-11e4-a66f-0ca5037a597d',
                'ext': 'mp4',
                'description': 'Washington Post transportation reporter Ashley Halsey III explains why a plane\'s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.',
                'upload_date': '20141230',
                'timestamp': 1419972442,
                'title': 'Why black boxes don’t transmit data in real time',
            },
        }],
        'skip': 'Doesnt have a video anymore',
    }, {
        'url': 'https://www.washingtonpost.com/nation/2021/08/05/dixie-river-fire-california-climate/',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return False for URLs that ``WashingtonPostIE`` already claims.

        The article pattern is broad enough to also match direct video URLs,
        so those are explicitly left to the dedicated video extractor.
        """
        if WashingtonPostIE.suitable(url):
            return False
        return super().suitable(url)

    def _real_extract(self, url):
        """Build a playlist of the videos embedded in the article at *url*.

        Video UUIDs are first looked up in the HTML markup; if none are
        found, the page's Next.js data is searched for ``video`` content
        elements instead.

        Returns a playlist info dict with ``id`` (page id), ``title``
        (Open Graph title) and ``entries`` (one URL result per UUID).
        """
        page_id = self._match_id(url)
        webpage = self._download_webpage(url, page_id)

        title = self._og_search_title(webpage)

        # UUIDs from inline embed markup: either a posttv embed <div> with
        # a data-uuid attribute, or any data-video-uuid attribute.
        uuids = re.findall(r'''(?x)
            (?:
                <div\s+class="posttv-video-embed[^>]*?data-uuid=|
                data-video-uuid=
            )"([^"]+)"''', webpage)

        if not uuids:
            json_data = self._search_nextjs_data(webpage, page_id)
            # traverse_obj yields None when the path is absent; fall back to
            # an empty list so an article without content elements produces
            # an empty playlist instead of a TypeError.
            content_elements = traverse_obj(
                json_data,
                ('props', 'pageProps', 'globalContent', 'content_elements')) or []
            for content_element in content_elements:
                if not isinstance(content_element, dict):
                    continue
                if content_element.get('type') != 'video':
                    continue
                video_uuid = content_element.get('_id')
                # Skip elements without an id; otherwise a bogus
                # 'washingtonpost:None' entry would be generated.
                if video_uuid:
                    uuids.append(video_uuid)

        entries = [
            self.url_result('washingtonpost:%s' % uuid, 'WashingtonPost', uuid)
            for uuid in uuids
        ]

        return {
            '_type': 'playlist',
            'entries': entries,
            'id': page_id,
            'title': title,
        }