]>
Commit | Line | Data |
---|---|---|
1 | import itertools | |
2 | ||
3 | from .common import InfoExtractor | |
4 | from ..utils import ( | |
5 | clean_html, | |
6 | extract_attributes, | |
7 | get_element_by_class, | |
8 | get_element_html_by_class, | |
9 | get_elements_html_by_class, | |
10 | parse_qs, | |
11 | traverse_obj, | |
12 | unified_strdate, | |
13 | urljoin | |
14 | ) | |
15 | ||
16 | ||
class TheGuardianPodcastIE(InfoExtractor):
    """Extractor for a single podcast episode page on theguardian.com."""

    # Path shape: /<section>/audio/<yyyy>/<mon>/<d>/<slug>; the slug is captured as the id.
    _VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/audio/\d{4}/\w{3}/\d{1,2}/(?P<id>[\w-]+)'
    _TESTS = [
        {
            'url': 'https://www.theguardian.com/news/audio/2023/nov/03/we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast',
            'md5': 'd1771744681789b4cd7da2a08e487702',
            'info_dict': {
                'id': 'we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast',
                'ext': 'mp3',
                'title': '‘We are just getting started’: the plastic-eating bacteria that could change the world – podcast',
                'description': 'md5:cfd3df2791d394d2ab62cd571d5207ee',
                'creator': 'Stephen Buranyi',
                'thumbnail': 'md5:73c12558fcb3b0e2a59422bfb33b3f79',
                'release_date': '20231103',
            },
        },
        {
            'url': 'https://www.theguardian.com/news/audio/2023/oct/30/the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast',
            'md5': 'd1771744681789b4cd7da2a08e487702',
            'info_dict': {
                'id': 'the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast',
                'ext': 'mp3',
                'title': 'The trials of Robert Habeck: is the world’s most powerful green politician doomed to fail? – podcast',
                'description': 'md5:1b5cf6582d1771c6b7077784b5456994',
                'creator': 'Philip Oltermann',
                'thumbnail': 'md5:6e5c5ec43843e956e20be793722e9080',
                'release_date': '20231030',
            },
        },
        {
            'url': 'https://www.theguardian.com/football/audio/2023/nov/06/arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly',
            'md5': 'a2fcff6f8e060a95b1483295273dc35e',
            'info_dict': {
                'id': 'arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly',
                'ext': 'mp3',
                'title': 'Arsenal feel hard done by and Luton hold Liverpool – Football Weekly',
                'description': 'md5:286a9fbddaeb7c83cc65d1c4a5330b2a',
                'creator': 'Max Rushden',
                'thumbnail': 'md5:93eb7d6440f1bb94eb3a6cad63f48afd',
                'release_date': '20231106',
            },
        },
        {
            'url': 'https://www.theguardian.com/politics/audio/2023/nov/02/the-covid-inquiry-politics-weekly-uk-podcast',
            'md5': '06a0f7e9701a80c8064a5d35690481ec',
            'info_dict': {
                'id': 'the-covid-inquiry-politics-weekly-uk-podcast',
                'ext': 'mp3',
                'title': 'The Covid inquiry | Politics Weekly UK - podcast',
                'description': 'md5:207c98859c14903582b17d25b014046e',
                'creator': 'Gaby Hinsliff',
                'thumbnail': 'md5:28932a7b5a25b057be330d2ed70ea7f3',
                'release_date': '20231102',
            },
        },
    ]

    def _real_extract(self, url):
        """Download the episode page at *url* and build its info dict.

        The id is the slug matched by ``_VALID_URL``. Title, description and
        thumbnail come from the page's Open Graph data, creator and release
        date from ``<meta>`` tags, and the media URL from the ``data-source``
        attribute of the element with class ``podcast__player``.
        """
        episode_id = self._match_id(url)
        page_html = self._download_webpage(url, episode_id)

        info = {'id': episode_id}
        # Use the on-page headline only when the Open Graph title is falsy.
        info['title'] = (
            self._og_search_title(page_html)
            or get_element_by_class('content__headline', page_html))
        info['description'] = self._og_search_description(page_html)
        info['creator'] = self._html_search_meta('author', page_html)
        info['thumbnail'] = self._og_search_thumbnail(page_html)

        published = self._html_search_meta('article:published_time', page_html)
        info['release_date'] = unified_strdate(published)

        # `or ''` keeps extract_attributes fed with a string when the player
        # lookup yields a falsy result; 'url' is then None.
        player_html = get_element_html_by_class('podcast__player', page_html) or ''
        info['url'] = extract_attributes(player_html).get('data-source')
        return info
82 | ||
83 | ||
class TheGuardianPodcastPlaylistIE(InfoExtractor):
    """Extractor for a podcast series page on theguardian.com, returned as a playlist."""

    # Path shape: /<section>/series/<slug>, optionally followed by ?page=<n>.
    _VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/series/(?P<id>[\w-]+)(?:\?page=\d+)?'
    _TESTS = [
        {
            'url': 'https://www.theguardian.com/football/series/theguardianswomensfootballweekly',
            'info_dict': {
                'id': 'theguardianswomensfootballweekly',
                'title': "The Guardian's Women's Football Weekly",
                'description': 'md5:e2cc021311e582d29935a73614a43f51',
            },
            'playlist_mincount': 69,
        },
        {
            'url': 'https://www.theguardian.com/news/series/todayinfocus?page=2',
            'info_dict': {
                'id': 'todayinfocus',
                'title': 'Today in Focus',
                'description': 'md5:0f097764fc0d359e0b6eb537be0387e2',
            },
            'playlist_mincount': 1261,
        },
        {
            'url': 'https://www.theguardian.com/news/series/the-audio-long-read',
            'info_dict': {
                'id': 'the-audio-long-read',
                'title': 'The Audio Long Read',
                'description': 'md5:5462994a27527309562b25b6defc4ef3',
            },
            'playlist_mincount': 996,
        },
    ]

    def _entries(self, url, playlist_id):
        """Yield the ``data-id`` value of every media item across the series pages.

        Pages are requested one by one, starting at ``page=1``, and each
        element with class ``fc-item--type-media`` contributes its
        ``data-id`` attribute.
        """
        for page_no in itertools.count(1):
            page_html, handle = self._download_webpage_handle(
                url, playlist_id, f'Downloading page {page_no}', query={'page': page_no})
            # Stop as soon as the final URL no longer carries a `page` parameter.
            # NOTE(review): presumably the site redirects out-of-range pages to
            # the bare series URL — confirm.
            if 'page' not in parse_qs(handle.url):
                return

            items = get_elements_html_by_class('fc-item--type-media', page_html)
            yield from traverse_obj(items, (..., {extract_attributes}, 'data-id'))

    def _real_extract(self, url):
        """Build a playlist result for the series page at *url*.

        The title is taken from the ``index-page-header__title`` element,
        falling back to ``flagship-audio__title``. The description is the
        Open Graph description, falling back to the ``description`` meta tag.
        Each entry from ``_entries`` is joined onto the site root and handed
        to ``TheGuardianPodcastIE``.
        """
        series_id = self._match_id(url)
        page_html = self._download_webpage(url, series_id)

        heading = get_element_by_class('index-page-header__title', page_html)
        if not heading:
            heading = get_element_by_class('flagship-audio__title', page_html)
        series_title = clean_html(heading)

        summary = self._og_search_description(page_html)
        if not summary:
            summary = self._html_search_meta('description', page_html)

        def to_absolute(path):
            # Entries are joined onto the site root to form the episode URL.
            return urljoin('https://www.theguardian.com', path)

        return self.playlist_from_matches(
            self._entries(url, series_id), series_id, series_title,
            description=summary, ie=TheGuardianPodcastIE, getter=to_absolute)