]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/radiocomercial.py
[cleanup] Fix infodict returned fields (#8906)
[yt-dlp.git] / yt_dlp / extractor / radiocomercial.py
1 import itertools
2
3 from .common import InfoExtractor
4 from ..networking.exceptions import HTTPError
5 from ..utils import (
6 ExtractorError,
7 extract_attributes,
8 get_element_by_class,
9 get_element_html_by_class,
10 get_element_text_and_html_by_tag,
11 get_elements_html_by_class,
12 int_or_none,
13 join_nonempty,
14 try_call,
15 unified_strdate,
16 update_url,
17 urljoin
18 )
19 from ..utils.traversal import traverse_obj
20
21
class RadioComercialIE(InfoExtractor):
    """Extractor for a single podcast episode page on radiocomercial.pt."""

    _VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/[^/?#]+/t?(?P<season>\d+)/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao/t6/taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas#page-content-wrapper',
        'md5': '5f4fe8e485b29d2e8fd495605bc2c7e4',
        'info_dict': {
            'id': 'taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas',
            'ext': 'mp3',
            'title': 'Taylor Swift entranhando-se que nem uma espada no ventre dos fãs.',
            'release_date': '20231025',
            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
            'season': 'Season 6',
            'season_number': 6,
        },
    }, {
        'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3/convenca-me-num-minuto-que-os-lobisomens-existem',
        'md5': '47e96c273aef96a8eb160cd6cf46d782',
        'info_dict': {
            'id': 'convenca-me-num-minuto-que-os-lobisomens-existem',
            'ext': 'mp3',
            'title': 'Convença-me num minuto que os lobisomens existem',
            'release_date': '20231026',
            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
            'season': 'Season 3',
            'season_number': 3,
        },
    }, {
        'url': 'https://radiocomercial.pt/podcasts/inacreditavel-by-ines-castel-branco/t2/o-desastre-de-aviao',
        'md5': '69be64255420fec23b7259955d771e54',
        'info_dict': {
            'id': 'o-desastre-de-aviao',
            'ext': 'mp3',
            'title': 'O desastre de avião',
            'description': 'md5:8a82beeb372641614772baab7246245f',
            'release_date': '20231101',
            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
            'season': 'Season 2',
            'season_number': 2,
        },
        'params': {
            # inconsistent md5
            'skip_download': True,
        },
    }, {
        'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/2023/t-n-t-29-de-outubro',
        'md5': '91d32d4d4b1407272068b102730fc9fa',
        'info_dict': {
            'id': 't-n-t-29-de-outubro',
            'ext': 'mp3',
            'title': 'T.N.T 29 de outubro',
            'release_date': '20231029',
            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
            'season': 'Season 2023',
            'season_number': 2023,
        },
    }]

    def _real_extract(self, url):
        """Download the episode page and assemble its info dict.

        The episode id and season are taken from the URL; the remaining
        fields are scraped from the downloaded page.
        """
        mobj = self._match_valid_url(url)
        video_id, season = mobj.group('id', 'season')
        webpage = self._download_webpage(url, video_id)

        title = self._html_extract_title(webpage)
        description = self._og_search_description(webpage, default=None)

        # The release date is the 'date' element nested in the 'descriptions' element
        descriptions_html = get_element_html_by_class('descriptions', webpage) or ''
        release_date = unified_strdate(get_element_by_class('date', descriptions_html))

        thumbnail = self._og_search_thumbnail(webpage)
        season_number = int_or_none(season)

        # The media URL is the href of the element with class 'audiofile'
        audiofile_html = get_element_html_by_class('audiofile', webpage) or ''
        media_url = extract_attributes(audiofile_html).get('href')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'release_date': release_date,
            'thumbnail': thumbnail,
            'season_number': season_number,
            'url': media_url,
        }
92
93
class RadioComercialPlaylistIE(InfoExtractor):
    """Playlist extractor for radiocomercial.pt podcast pages, optionally limited to one season."""

    _VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/(?P<id>[\w-]+)(?:/t?(?P<season>\d+))?/?(?:$|[?#])'
    _TESTS = [{
        'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3',
        'info_dict': {
            'id': 'convenca-me-num-minuto_t3',
            'title': 'Convença-me num Minuto - Temporada 3',
        },
        'playlist_mincount': 32
    }, {
        'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao',
        'info_dict': {
            'id': 'o-homem-que-mordeu-o-cao',
            'title': 'O Homem Que Mordeu o Cão',
        },
        'playlist_mincount': 19
    }, {
        'url': 'https://radiocomercial.pt/podcasts/as-minhas-coisas-favoritas',
        'info_dict': {
            'id': 'as-minhas-coisas-favoritas',
            'title': 'As Minhas Coisas Favoritas',
        },
        'playlist_mincount': 131
    }, {
        'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/t2023',
        'info_dict': {
            'id': 'tnt-todos-no-top_t2023',
            'title': 'TNT - Todos No Top - Temporada 2023',
        },
        'playlist_mincount': 39
    }, {
        # Trailing slash is accepted by _VALID_URL
        'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3/',
        'only_matching': True,
    }]

    def _entries(self, url, playlist_id):
        """Yield episode URLs found on the numbered pages of a podcast listing.

        Pages are requested as '<url>/1', '<url>/2', ... and iteration stops
        on the first HTTP 404 or the first page without episode links. Other
        download errors are re-raised.
        """
        # _VALID_URL allows a trailing slash; strip it so that page URLs are
        # built as '<base>/<page>' and not '<base>//<page>'
        base_url = url.rstrip('/')
        for page in itertools.count(1):
            try:
                webpage = self._download_webpage(
                    f'{base_url}/{page}', playlist_id, f'Downloading page {page}')
            except ExtractorError as e:
                # A 404 marks the end of the pagination
                if isinstance(e.cause, HTTPError) and e.cause.status == 404:
                    break
                raise

            episodes = get_elements_html_by_class('tm-ouvir-podcast', webpage)
            if not episodes:
                break
            for url_path in traverse_obj(episodes, (..., {extract_attributes}, 'href')):
                episode_url = urljoin(url, url_path)
                # Skip hrefs that could not be resolved to a URL, and links
                # that are not episode pages
                if episode_url and RadioComercialIE.suitable(episode_url):
                    yield episode_url

    def _real_extract(self, url):
        """Build a playlist of the episodes listed for a podcast or podcast season.

        The playlist id is the podcast slug, suffixed with '_t<season>' when
        the URL contains a season. The title is the text of the page's first
        <h1>, with ' - Temporada <season>' appended when a season is present.
        """
        podcast, season = self._match_valid_url(url).group('id', 'season')
        playlist_id = join_nonempty(podcast, season, delim='_t')
        # Drop query and fragment so that page numbers can be appended to the path
        url = update_url(url, query=None, fragment=None)
        webpage = self._download_webpage(url, playlist_id)

        # try_call turns a failed <h1> lookup into None instead of an error
        name = try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
        title = name if name == season else join_nonempty(name, season, delim=' - Temporada ')

        return self.playlist_from_matches(
            self._entries(url, playlist_id), playlist_id, title, ie=RadioComercialIE)