yt_dlp/extractor/radiocomercial.py

   1 import itertools
   2
   3 from .common import InfoExtractor
   4 from ..networking.exceptions import HTTPError
   5 from ..utils import (
   6     ExtractorError,
   7     extract_attributes,
   8     get_element_by_class,
   9     get_element_html_by_class,
  10     get_element_text_and_html_by_tag,
  11     get_elements_html_by_class,
  12     int_or_none,
  13     join_nonempty,
  14     try_call,
  15     unified_strdate,
  16     update_url,
  17     urljoin
  18 )
  19 from ..utils.traversal import traverse_obj
  20
  21
  22 class RadioComercialIE(InfoExtractor):
  23     _VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/[^/?#]+/t?(?P<season>\d+)/(?P<id>[\w-]+)'
  24     _TESTS = [{
  25         'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao/t6/taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas#page-content-wrapper',
  26         'md5': '5f4fe8e485b29d2e8fd495605bc2c7e4',
  27         'info_dict': {
  28             'id': 'taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas',
  29             'ext': 'mp3',
  30             'title': 'Taylor Swift entranhando-se que nem uma espada no ventre dos fãs.',
  31             'release_date': '20231025',
  32             'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
  33             'season': 'Season 6',
  34             'season_number': 6,
  35         }
  36     }, {
  37         'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3/convenca-me-num-minuto-que-os-lobisomens-existem',
  38         'md5': '47e96c273aef96a8eb160cd6cf46d782',
  39         'info_dict': {
  40             'id': 'convenca-me-num-minuto-que-os-lobisomens-existem',
  41             'ext': 'mp3',
  42             'title': 'Convença-me num minuto que os lobisomens existem',
  43             'release_date': '20231026',
  44             'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
  45             'season': 'Season 3',
  46             'season_number': 3,
  47         }
  48     }, {
  49         'url': 'https://radiocomercial.pt/podcasts/inacreditavel-by-ines-castel-branco/t2/o-desastre-de-aviao',
  50         'md5': '69be64255420fec23b7259955d771e54',
  51         'info_dict': {
  52             'id': 'o-desastre-de-aviao',
  53             'ext': 'mp3',
  54             'title': 'O desastre de avião',
  55             'description': 'md5:8a82beeb372641614772baab7246245f',
  56             'release_date': '20231101',
  57             'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
  58             'season': 'Season 2',
  59             'season_number': 2,
  60         },
  61         'params': {
  62             # inconsistant md5
  63             'skip_download': True,
  64         },
  65     }, {
  66         'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/2023/t-n-t-29-de-outubro',
  67         'md5': '91d32d4d4b1407272068b102730fc9fa',
  68         'info_dict': {
  69             'id': 't-n-t-29-de-outubro',
  70             'ext': 'mp3',
  71             'title': 'T.N.T 29 de outubro',
  72             'release_date': '20231029',
  73             'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
  74             'season': 'Season 2023',
  75             'season_number': 2023,
  76         }
  77     }]
  78
  79     def _real_extract(self, url):
  80         video_id, season = self._match_valid_url(url).group('id', 'season')
  81         webpage = self._download_webpage(url, video_id)
  82         return {
  83             'id': video_id,
  84             'title': self._html_extract_title(webpage),
  85             'description': self._og_search_description(webpage, default=None),
  86             'release_date': unified_strdate(get_element_by_class(
  87                 'date', get_element_html_by_class('descriptions', webpage) or '')),
  88             'thumbnail': self._og_search_thumbnail(webpage),
  89             'season_number': int_or_none(season),
  90             'url': extract_attributes(get_element_html_by_class('audiofile', webpage) or '').get('href'),
  91         }
  92
  93
  94 class RadioComercialPlaylistIE(InfoExtractor):
  95     _VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/(?P<id>[\w-]+)(?:/t?(?P<season>\d+))?/?(?:$|[?#])'
  96     _TESTS = [{
  97         'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3',
  98         'info_dict': {
  99             'id': 'convenca-me-num-minuto_t3',
 100             'title': 'Convença-me num Minuto - Temporada 3',
 101         },
 102         'playlist_mincount': 32
 103     }, {
 104         'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao',
 105         'info_dict': {
 106             'id': 'o-homem-que-mordeu-o-cao',
 107             'title': 'O Homem Que Mordeu o Cão',
 108         },
 109         'playlist_mincount': 19
 110     }, {
 111         'url': 'https://radiocomercial.pt/podcasts/as-minhas-coisas-favoritas',
 112         'info_dict': {
 113             'id': 'as-minhas-coisas-favoritas',
 114             'title': 'As Minhas Coisas Favoritas',
 115         },
 116         'playlist_mincount': 131
 117     }, {
 118         'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/t2023',
 119         'info_dict': {
 120             'id': 'tnt-todos-no-top_t2023',
 121             'title': 'TNT - Todos No Top - Temporada 2023',
 122         },
 123         'playlist_mincount': 39
 124     }]
 125
 126     def _entries(self, url, playlist_id):
 127         for page in itertools.count(1):
 128             try:
 129                 webpage = self._download_webpage(
 130                     f'{url}/{page}', playlist_id, f'Downloading page {page}')
 131             except ExtractorError as e:
 132                 if isinstance(e.cause, HTTPError) and e.cause.status == 404:
 133                     break
 134                 raise
 135
 136             episodes = get_elements_html_by_class('tm-ouvir-podcast', webpage)
 137             if not episodes:
 138                 break
 139             for url_path in traverse_obj(episodes, (..., {extract_attributes}, 'href')):
 140                 episode_url = urljoin(url, url_path)
 141                 if RadioComercialIE.suitable(episode_url):
 142                     yield episode_url
 143
 144     def _real_extract(self, url):
 145         podcast, season = self._match_valid_url(url).group('id', 'season')
 146         playlist_id = join_nonempty(podcast, season, delim='_t')
 147         url = update_url(url, query=None, fragment=None)
 148         webpage = self._download_webpage(url, playlist_id)
 149
 150         name = try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
 151         title = name if name == season else join_nonempty(name, season, delim=' - Temporada ')
 152
 153         return self.playlist_from_matches(
 154             self._entries(url, playlist_id), playlist_id, title, ie=RadioComercialIE)