[yt-dlp.git] / yt_dlp / extractor / radiocomercial.py

import itertools

from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
    ExtractorError,
    extract_attributes,
    get_element_by_class,
    get_element_html_by_class,
    get_element_text_and_html_by_tag,
    get_elements_html_by_class,
    int_or_none,
    join_nonempty,
    try_call,
    unified_strdate,
    update_url,
    urljoin
)
from ..utils.traversal import traverse_obj


class RadioComercialIE(InfoExtractor):
    """Extractor for a single podcast episode page on radiocomercial.pt.

    Episode URLs have the form ``/podcasts/<podcast>/[t]<season>/<episode-slug>``;
    the episode slug is used as the video id and the digits of the season
    path segment are reported as the season number.
    """
    _VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/[^/?#]+/t?(?P<season>\d+)/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao/t6/taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas#page-content-wrapper',
        'md5': '5f4fe8e485b29d2e8fd495605bc2c7e4',
        'info_dict': {
            'id': 'taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas',
            'ext': 'mp3',
            'title': 'Taylor Swift entranhando-se que nem uma espada no ventre dos fãs.',
            'release_date': '20231025',
            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
            'season': 'Season 6',
            'season_number': 6,
        },
    }, {
        'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3/convenca-me-num-minuto-que-os-lobisomens-existem',
        'md5': '47e96c273aef96a8eb160cd6cf46d782',
        'info_dict': {
            'id': 'convenca-me-num-minuto-que-os-lobisomens-existem',
            'ext': 'mp3',
            'title': 'Convença-me num minuto que os lobisomens existem',
            'release_date': '20231026',
            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
            'season': 'Season 3',
            'season_number': 3,
        },
    }, {
        'url': 'https://radiocomercial.pt/podcasts/inacreditavel-by-ines-castel-branco/t2/o-desastre-de-aviao',
        'md5': '69be64255420fec23b7259955d771e54',
        'info_dict': {
            'id': 'o-desastre-de-aviao',
            'ext': 'mp3',
            'title': 'O desastre de avião',
            'description': 'md5:8a82beeb372641614772baab7246245f',
            'release_date': '20231101',
            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
            'season': 'Season 2',
            'season_number': 2,
        },
        'params': {
            # inconsistent md5
            'skip_download': True,
        },
    }, {
        'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/2023/t-n-t-29-de-outubro',
        'md5': '91d32d4d4b1407272068b102730fc9fa',
        'info_dict': {
            'id': 't-n-t-29-de-outubro',
            'ext': 'mp3',
            'title': 'T.N.T 29 de outubro',
            'release_date': '20231029',
            'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
            'season': 'Season 2023',
            'season_number': 2023,
        },
    }]

    def _real_extract(self, url):
        """Download the episode page and build its info dict.

        The id and season come from the URL; title, description, release date,
        thumbnail and the media URL are scraped from the page HTML.
        """
        video_id, season = self._match_valid_url(url).group('id', 'season')
        webpage = self._download_webpage(url, video_id)

        # The release date is looked up in a "date" element inside the
        # "descriptions" container; a missing container degrades to '' so the
        # lookup yields no date instead of raising.
        descriptions_html = get_element_html_by_class('descriptions', webpage) or ''
        # The media URL is the href of the element carrying the "audiofile" class.
        audio_attrs = extract_attributes(get_element_html_by_class('audiofile', webpage) or '')

        return {
            'id': video_id,
            'title': self._html_extract_title(webpage),
            'description': self._og_search_description(webpage, default=None),
            'release_date': unified_strdate(get_element_by_class('date', descriptions_html)),
            'thumbnail': self._og_search_thumbnail(webpage),
            # `season` is reserved for the season's title (a string); the
            # number parsed from the URL belongs in `season_number`.
            'season_number': int_or_none(season),
            'url': audio_attrs.get('href'),
        }


class RadioComercialPlaylistIE(InfoExtractor):
    """Extractor for a podcast (optionally one season of it) on radiocomercial.pt.

    Playlist URLs have the form ``/podcasts/<podcast>[/[t]<season>]``. Episodes
    are collected by walking the numbered listing pages ``<url>/1``, ``<url>/2``, ...
    and are delegated to ``RadioComercialIE``.
    """
    _VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/(?P<id>[\w-]+)(?:/t?(?P<season>\d+))?/?(?:$|[?#])'
    _TESTS = [{
        'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3',
        'info_dict': {
            'id': 'convenca-me-num-minuto_t3',
            'title': 'Convença-me num Minuto - Temporada 3',
        },
        'playlist_mincount': 32,
    }, {
        'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao',
        'info_dict': {
            'id': 'o-homem-que-mordeu-o-cao',
            'title': 'O Homem Que Mordeu o Cão',
        },
        'playlist_mincount': 19,
    }, {
        'url': 'https://radiocomercial.pt/podcasts/as-minhas-coisas-favoritas',
        'info_dict': {
            'id': 'as-minhas-coisas-favoritas',
            'title': 'As Minhas Coisas Favoritas',
        },
        'playlist_mincount': 131,
    }, {
        'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/t2023',
        'info_dict': {
            'id': 'tnt-todos-no-top_t2023',
            'title': 'TNT - Todos No Top - Temporada 2023',
        },
        'playlist_mincount': 39,
    }]

    def _entries(self, url, playlist_id):
        """Yield episode URLs found on the paginated listing of ``url``.

        Pages are requested as ``<url>/<n>`` for n = 1, 2, ... Iteration stops at
        the first page that returns HTTP 404 or contains no episode links; any
        other download error is re-raised.
        """
        # _VALID_URL accepts an optional trailing slash; drop it so the page
        # URLs never contain an empty path segment ("...//1").
        base_url = url.rstrip('/')
        for page in itertools.count(1):
            try:
                webpage = self._download_webpage(
                    f'{base_url}/{page}', playlist_id, f'Downloading page {page}')
            except ExtractorError as e:
                # A 404 is how the site signals that we ran past the last page
                if isinstance(e.cause, HTTPError) and e.cause.status == 404:
                    break
                raise

            episodes = get_elements_html_by_class('tm-ouvir-podcast', webpage)
            if not episodes:
                break
            for url_path in traverse_obj(episodes, (..., {extract_attributes}, 'href')):
                episode_url = urljoin(url, url_path)
                # Only pass on links that the single-episode extractor can handle
                if RadioComercialIE.suitable(episode_url):
                    yield episode_url

    def _real_extract(self, url):
        """Build the playlist result for a podcast or podcast season URL.

        The playlist id is ``<podcast>`` or ``<podcast>_t<season>``; the title is
        the text of the page's first ``<h1>``, suffixed with
        `` - Temporada <season>`` when a season is present in the URL.
        """
        podcast, season = self._match_valid_url(url).group('id', 'season')
        playlist_id = join_nonempty(podcast, season, delim='_t')
        # Query string and fragment must not leak into the per-page URLs
        url = update_url(url, query=None, fragment=None)
        webpage = self._download_webpage(url, playlist_id)

        # try_call guards the [0] lookup so a failed <h1> search gives name=None
        name = try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
        title = name if name == season else join_nonempty(name, season, delim=' - Temporada ')

        return self.playlist_from_matches(
            self._entries(url, playlist_id), playlist_id, title, ie=RadioComercialIE)
Commit	Line	Data
ef12dbdc S	1	import itertools
	2
	3	from .common import InfoExtractor
	4	from ..networking.exceptions import HTTPError
	5	from ..utils import (
	6	ExtractorError,
	7	extract_attributes,
	8	get_element_by_class,
	9	get_element_html_by_class,
	10	get_element_text_and_html_by_tag,
	11	get_elements_html_by_class,
	12	int_or_none,
	13	join_nonempty,
	14	try_call,
	15	unified_strdate,
	16	update_url,
	17	urljoin
	18	)
	19	from ..utils.traversal import traverse_obj
	20
	21
	22	class RadioComercialIE(InfoExtractor):
	23	_VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/[^/?#]+/t?(?P<season>\d+)/(?P<id>[\w-]+)'
	24	_TESTS = [{
	25	'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao/t6/taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas#page-content-wrapper',
	26	'md5': '5f4fe8e485b29d2e8fd495605bc2c7e4',
	27	'info_dict': {
	28	'id': 'taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas',
	29	'ext': 'mp3',
	30	'title': 'Taylor Swift entranhando-se que nem uma espada no ventre dos fãs.',
	31	'release_date': '20231025',
	32	'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
	33	'season': 6
	34	}
	35	}, {
	36	'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3/convenca-me-num-minuto-que-os-lobisomens-existem',
	37	'md5': '47e96c273aef96a8eb160cd6cf46d782',
	38	'info_dict': {
	39	'id': 'convenca-me-num-minuto-que-os-lobisomens-existem',
	40	'ext': 'mp3',
	41	'title': 'Convença-me num minuto que os lobisomens existem',
	42	'release_date': '20231026',
	43	'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
	44	'season': 3
	45	}
	46	}, {
	47	'url': 'https://radiocomercial.pt/podcasts/inacreditavel-by-ines-castel-branco/t2/o-desastre-de-aviao',
	48	'md5': '69be64255420fec23b7259955d771e54',
	49	'info_dict': {
	50	'id': 'o-desastre-de-aviao',
	51	'ext': 'mp3',
	52	'title': 'O desastre de avião',
	53	'description': 'md5:8a82beeb372641614772baab7246245f',
	54	'release_date': '20231101',
	55	'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
	56	'season': 2
	57	},
	58	'params': {
	59	# inconsistant md5
	60	'skip_download': True,
	61	},
	62	}, {
	63	'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/2023/t-n-t-29-de-outubro',
	64	'md5': '91d32d4d4b1407272068b102730fc9fa',
65	'info_dict': {
66	'id': 't-n-t-29-de-outubro',
67	'ext': 'mp3',
68	'title': 'T.N.T 29 de outubro',
69	'release_date': '20231029',
70	'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
71	'season': 2023
72	}
73	}]
74
75	def _real_extract(self, url):
76	video_id, season = self._match_valid_url(url).group('id', 'season')
77	webpage = self._download_webpage(url, video_id)
78	return {
79	'id': video_id,
80	'title': self._html_extract_title(webpage),
81	'description': self._og_search_description(webpage, default=None),
82	'release_date': unified_strdate(get_element_by_class(
83	'date', get_element_html_by_class('descriptions', webpage) or '')),
84	'thumbnail': self._og_search_thumbnail(webpage),
85	'season': int_or_none(season),
86	'url': extract_attributes(get_element_html_by_class('audiofile', webpage) or '').get('href'),
87	}
88
89
90	class RadioComercialPlaylistIE(InfoExtractor):
	91	_VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/(?P<id>[\w-]+)(?:/t?(?P<season>\d+))?/?(?:$|[?#])'
92	_TESTS = [{
93	'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3',
94	'info_dict': {
95	'id': 'convenca-me-num-minuto_t3',
96	'title': 'Convença-me num Minuto - Temporada 3',
97	},
98	'playlist_mincount': 32
99	}, {
100	'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao',
101	'info_dict': {
102	'id': 'o-homem-que-mordeu-o-cao',
103	'title': 'O Homem Que Mordeu o Cão',
104	},
105	'playlist_mincount': 19
106	}, {
107	'url': 'https://radiocomercial.pt/podcasts/as-minhas-coisas-favoritas',
108	'info_dict': {
109	'id': 'as-minhas-coisas-favoritas',
110	'title': 'As Minhas Coisas Favoritas',
111	},
112	'playlist_mincount': 131
113	}, {
114	'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/t2023',
115	'info_dict': {
116	'id': 'tnt-todos-no-top_t2023',
117	'title': 'TNT - Todos No Top - Temporada 2023',
118	},
119	'playlist_mincount': 39
120	}]
121
122	def _entries(self, url, playlist_id):
123	for page in itertools.count(1):
124	try:
125	webpage = self._download_webpage(
126	f'{url}/{page}', playlist_id, f'Downloading page {page}')
127	except ExtractorError as e:
128	if isinstance(e.cause, HTTPError) and e.cause.status == 404:
129	break
130	raise
131
132	episodes = get_elements_html_by_class('tm-ouvir-podcast', webpage)
133	if not episodes:
134	break
135	for url_path in traverse_obj(episodes, (..., {extract_attributes}, 'href')):
136	episode_url = urljoin(url, url_path)
137	if RadioComercialIE.suitable(episode_url):
138	yield episode_url
139
140	def _real_extract(self, url):
141	podcast, season = self._match_valid_url(url).group('id', 'season')
142	playlist_id = join_nonempty(podcast, season, delim='_t')
143	url = update_url(url, query=None, fragment=None)
144	webpage = self._download_webpage(url, playlist_id)
145
146	name = try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
147	title = name if name == season else join_nonempty(name, season, delim=' - Temporada ')
148
149	return self.playlist_from_matches(
150	self._entries(url, playlist_id), playlist_id, title, ie=RadioComercialIE)