]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/radiocomercial.py
[ie/roosterteeth] Extract release date and timestamp (#9393)
[yt-dlp.git] / yt_dlp / extractor / radiocomercial.py
CommitLineData
ef12dbdc
S
1import itertools
2
3from .common import InfoExtractor
4from ..networking.exceptions import HTTPError
5from ..utils import (
6 ExtractorError,
7 extract_attributes,
8 get_element_by_class,
9 get_element_html_by_class,
10 get_element_text_and_html_by_tag,
11 get_elements_html_by_class,
12 int_or_none,
13 join_nonempty,
14 try_call,
15 unified_strdate,
16 update_url,
17 urljoin
18)
19from ..utils.traversal import traverse_obj
20
21
class RadioComercialIE(InfoExtractor):
    """Extractor for a single podcast episode page on radiocomercial.pt.

    Matches URLs of the form ``/podcasts/<show>/[t]<season>/<episode-slug>``;
    the episode slug is used as the video id and the digits of the season
    path segment are reported as the season number.
    """
    _VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/[^/?#]+/t?(?P<season>\d+)/(?P<id>[\w-]+)'
    # NOTE(review): the 'season' strings below assume yt-dlp core derives
    # 'season' as 'Season N' from 'season_number' -- confirm against YoutubeDL.
    _TESTS = [{
        'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao/t6/taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas#page-content-wrapper',
        'md5': '5f4fe8e485b29d2e8fd495605bc2c7e4',
        'info_dict': {
            'id': 'taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas',
            'ext': 'mp3',
            'title': 'Taylor Swift entranhando-se que nem uma espada no ventre dos fãs.',
            'release_date': '20231025',
            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
            'season': 'Season 6',
            'season_number': 6,
        },
    }, {
        'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3/convenca-me-num-minuto-que-os-lobisomens-existem',
        'md5': '47e96c273aef96a8eb160cd6cf46d782',
        'info_dict': {
            'id': 'convenca-me-num-minuto-que-os-lobisomens-existem',
            'ext': 'mp3',
            'title': 'Convença-me num minuto que os lobisomens existem',
            'release_date': '20231026',
            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
            'season': 'Season 3',
            'season_number': 3,
        },
    }, {
        'url': 'https://radiocomercial.pt/podcasts/inacreditavel-by-ines-castel-branco/t2/o-desastre-de-aviao',
        'md5': '69be64255420fec23b7259955d771e54',
        'info_dict': {
            'id': 'o-desastre-de-aviao',
            'ext': 'mp3',
            'title': 'O desastre de avião',
            'description': 'md5:8a82beeb372641614772baab7246245f',
            'release_date': '20231101',
            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
            'season': 'Season 2',
            'season_number': 2,
        },
        'params': {
            # inconsistent md5
            'skip_download': True,
        },
    }, {
        'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/2023/t-n-t-29-de-outubro',
        'md5': '91d32d4d4b1407272068b102730fc9fa',
        'info_dict': {
            'id': 't-n-t-29-de-outubro',
            'ext': 'mp3',
            'title': 'T.N.T 29 de outubro',
            'release_date': '20231029',
            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
            'season': 'Season 2023',
            'season_number': 2023,
        },
    }]

    def _real_extract(self, url):
        """Download the episode page and build its info dict.

        The id and season come from the URL; title, description, release
        date, thumbnail and the audio URL are scraped from the page HTML.
        """
        video_id, season = self._match_valid_url(url).group('id', 'season')
        webpage = self._download_webpage(url, video_id)
        return {
            'id': video_id,
            'title': self._html_extract_title(webpage),
            'description': self._og_search_description(webpage, default=None),
            # The date is looked up inside the 'descriptions' element only;
            # `or ''` keeps the lookup safe when that element is absent.
            'release_date': unified_strdate(get_element_by_class(
                'date', get_element_html_by_class('descriptions', webpage) or '')),
            'thumbnail': self._og_search_thumbnail(webpage),
            # The URL only carries a number, so report it as 'season_number';
            # 'season' is reserved for the season's title string.
            'season_number': int_or_none(season),
            # Audio location is the href of the element with class 'audiofile'
            # (None when no such element is found).
            'url': extract_attributes(get_element_html_by_class('audiofile', webpage) or '').get('href'),
        }
88
89
class RadioComercialPlaylistIE(InfoExtractor):
    """Extractor for a podcast (optionally one season of it) on radiocomercial.pt.

    Matches ``/podcasts/<show>`` and ``/podcasts/<show>/[t]<season>``, with an
    optional trailing slash, and yields every episode URL that
    ``RadioComercialIE`` can handle.
    """
    _VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/(?P<id>[\w-]+)(?:/t?(?P<season>\d+))?/?(?:$|[?#])'
    _TESTS = [{
        'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3',
        'info_dict': {
            'id': 'convenca-me-num-minuto_t3',
            'title': 'Convença-me num Minuto - Temporada 3',
        },
        'playlist_mincount': 32,
    }, {
        'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao',
        'info_dict': {
            'id': 'o-homem-que-mordeu-o-cao',
            'title': 'O Homem Que Mordeu o Cão',
        },
        'playlist_mincount': 19,
    }, {
        'url': 'https://radiocomercial.pt/podcasts/as-minhas-coisas-favoritas',
        'info_dict': {
            'id': 'as-minhas-coisas-favoritas',
            'title': 'As Minhas Coisas Favoritas',
        },
        'playlist_mincount': 131,
    }, {
        'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/t2023',
        'info_dict': {
            'id': 'tnt-todos-no-top_t2023',
            'title': 'TNT - Todos No Top - Temporada 2023',
        },
        'playlist_mincount': 39,
    }]

    def _entries(self, url, playlist_id):
        """Yield episode URLs found on the numbered listing pages of *url*.

        Pages are requested as ``<url>/1``, ``<url>/2``, ... until a page
        answers HTTP 404 or contains no 'tm-ouvir-podcast' elements. Any
        other download error is re-raised.
        """
        # _VALID_URL tolerates a trailing slash; drop it here so the page URL
        # does not end up with an empty path segment ('//<page>').
        page_base = url.rstrip('/')
        for page in itertools.count(1):
            try:
                webpage = self._download_webpage(
                    f'{page_base}/{page}', playlist_id, f'Downloading page {page}')
            except ExtractorError as e:
                # Running past the last page is signalled by a 404.
                if isinstance(e.cause, HTTPError) and e.cause.status == 404:
                    break
                raise

            episodes = get_elements_html_by_class('tm-ouvir-podcast', webpage)
            if not episodes:
                break
            for url_path in traverse_obj(episodes, (..., {extract_attributes}, 'href')):
                episode_url = urljoin(url, url_path)
                # Skip links that are not episode pages.
                if RadioComercialIE.suitable(episode_url):
                    yield episode_url

    def _real_extract(self, url):
        """Build the playlist result for a podcast or podcast season URL.

        The playlist id is ``<show>`` or ``<show>_t<season>``; the title is
        the text of the page's first ``<h1>``, suffixed with
        `` - Temporada <season>`` when a season is present in the URL.
        """
        podcast, season = self._match_valid_url(url).group('id', 'season')
        playlist_id = join_nonempty(podcast, season, delim='_t')
        # Query and fragment are dropped so page numbers can be appended to the path.
        url = update_url(url, query=None, fragment=None)
        webpage = self._download_webpage(url, playlist_id)

        # try_call returns None when no <h1> can be extracted.
        name = try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
        title = name if name == season else join_nonempty(name, season, delim=' - Temporada ')

        return self.playlist_from_matches(
            self._entries(url, playlist_id), playlist_id, title, ie=RadioComercialIE)