yt_dlp/extractor/la7.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..networking import HEADRequest
   5 from ..utils import float_or_none, int_or_none, parse_duration, unified_strdate
   6
   7
   8 class LA7IE(InfoExtractor):
   9     IE_NAME = 'la7.it'
  10     _VALID_URL = r'''(?x)https?://(?:
  11         (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video|news)/|
  12         tg\.la7\.it/repliche-tgla7\?id=
  13     )(?P<id>.+)'''
  14
  15     _TESTS = [{
  16         # single quality video
  17         'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722',
  18         'md5': '8b613ffc0c4bf9b9e377169fc19c214c',
  19         'info_dict': {
  20             'id': 'inccool8-02-10-2015-163722',
  21             'ext': 'mp4',
  22             'title': 'Inc.Cool8',
  23             'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico',
  24             'thumbnail': 're:^https?://.*',
  25             'upload_date': '20151002',
  26             'formats': 'count:4',
  27         },
  28     }, {
  29         # multiple quality video
  30         'url': 'https://www.la7.it/calcio-femminile/news/il-gol-di-lindsey-thomas-fiorentina-vs-milan-serie-a-calcio-femminile-26-11-2022-461736',
  31         'md5': 'd2370e78f75e8d1238cb3a0db9a2eda3',
  32         'info_dict': {
  33             'id': 'il-gol-di-lindsey-thomas-fiorentina-vs-milan-serie-a-calcio-femminile-26-11-2022-461736',
  34             'ext': 'mp4',
  35             'title': 'Il gol di Lindsey Thomas | Fiorentina vs Milan | Serie A Calcio Femminile',
  36             'description': 'Il gol di Lindsey Thomas | Fiorentina vs Milan | Serie A Calcio Femminile',
  37             'thumbnail': 're:^https?://.*',
  38             'upload_date': '20221126',
  39             'formats': 'count:8',
  40         },
  41     }, {
  42         'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077',
  43         'only_matching': True,
  44     }]
  45     _HOST = 'https://awsvodpkg.iltrovatore.it'
  46
  47     def _generate_mp4_url(self, quality, m3u8_formats):
  48         for f in m3u8_formats:
  49             if f['vcodec'] != 'none' and quality in f['url']:
  50                 http_url = f'{self._HOST}{quality}.mp4'
  51
  52                 urlh = self._request_webpage(
  53                     HEADRequest(http_url), quality,
  54                     note='Check filesize', fatal=False)
  55                 if urlh:
  56                     http_f = f.copy()
  57                     del http_f['manifest_url']
  58                     http_f.update({
  59                         'format_id': http_f['format_id'].replace('hls-', 'https-'),
  60                         'url': http_url,
  61                         'protocol': 'https',
  62                         'filesize_approx': int_or_none(urlh.headers.get('Content-Length', None)),
  63                     })
  64                     return http_f
  65                 return None
  66
  67     def _real_extract(self, url):
  68         video_id = self._match_id(url)
  69         webpage = self._download_webpage(url, video_id)
  70
  71         if re.search(r'(?i)(drmsupport\s*:\s*true)\s*', webpage):
  72             self.report_drm(video_id)
  73
  74         video_path = self._search_regex(
  75             r'(/content/[\w/,]+?)\.mp4(?:\.csmil)?/master\.m3u8', webpage, 'video_path')
  76
  77         formats = self._extract_mpd_formats(
  78             f'{self._HOST}/local/dash/,{video_path}.mp4.urlset/manifest.mpd',
  79             video_id, mpd_id='dash', fatal=False)
  80         m3u8_formats = self._extract_m3u8_formats(
  81             f'{self._HOST}/local/hls/,{video_path}.mp4.urlset/master.m3u8',
  82             video_id, 'mp4', m3u8_id='hls', fatal=False)
  83         formats.extend(m3u8_formats)
  84
  85         for q in filter(None, video_path.split(',')):
  86             http_f = self._generate_mp4_url(q, m3u8_formats)
  87             if http_f:
  88                 formats.append(http_f)
  89
  90         return {
  91             'id': video_id,
  92             'title': self._og_search_title(webpage, default=None),
  93             'description': self._og_search_description(webpage, default=None),
  94             'thumbnail': self._og_search_thumbnail(webpage, default=None),
  95             'formats': formats,
  96             'upload_date': unified_strdate(self._search_regex(r'datetime="(.+?)"', webpage, 'upload_date', fatal=False))
  97         }
  98
  99
 100 class LA7PodcastEpisodeIE(InfoExtractor):
 101     IE_NAME = 'la7.it:pod:episode'
 102     _VALID_URL = r'https?://(?:www\.)?la7\.it/[^/]+/podcast/([^/]+-)?(?P<id>\d+)'
 103
 104     _TESTS = [{
 105         'url': 'https://www.la7.it/voicetown/podcast/la-carezza-delle-memoria-di-carlo-verdone-23-03-2021-371497',
 106         'md5': '7737d4d79b3c1a34b3de3e16297119ed',
 107         'info_dict': {
 108             'id': '371497',
 109             'ext': 'mp3',
 110             'title': '"La carezza delle memoria" di Carlo Verdone',
 111             'description': 'md5:5abf07c3c551a687db80af3f9ceb7d52',
 112             'thumbnail': 'https://www.la7.it/sites/default/files/podcast/371497.jpg',
 113             'upload_date': '20210323',
 114         },
 115     }, {
 116         # embed url
 117         'url': 'https://www.la7.it/embed/podcast/371497',
 118         'only_matching': True,
 119     }, {
 120         # date already in the title
 121         'url': 'https://www.la7.it/propagandalive/podcast/lintervista-di-diego-bianchi-ad-annalisa-cuzzocrea-puntata-del-1932021-20-03-2021-371130',
 122         'only_matching': True,
 123     }, {
 124         # title same as show_title
 125         'url': 'https://www.la7.it/otto-e-mezzo/podcast/otto-e-mezzo-26-03-2021-372340',
 126         'only_matching': True,
 127     }]
 128
 129     def _extract_info(self, webpage, video_id=None, ppn=None):
 130         if not video_id:
 131             video_id = self._search_regex(
 132                 r'data-nid=([\'"])(?P<vid>\d+)\1',
 133                 webpage, 'video_id', group='vid')
 134
 135         media_url = self._search_regex(
 136             (r'src\s*:\s*([\'"])(?P<url>\S+?mp3.+?)\1',
 137              r'data-podcast\s*=\s*([\'"])(?P<url>\S+?mp3.+?)\1'),
 138             webpage, 'media_url', group='url')
 139         formats = [{
 140             'url': media_url,
 141             'format_id': 'http-mp3',
 142             'ext': 'mp3',
 143             'acodec': 'mp3',
 144             'vcodec': 'none',
 145         }]
 146
 147         title = self._html_search_regex(
 148             (r'<div class="title">(?P<title>.+?)</',
 149              r'<title>(?P<title>[^<]+)</title>',
 150              r'title:\s*([\'"])(?P<title>.+?)\1'),
 151             webpage, 'title', group='title')
 152
 153         description = (
 154             self._html_search_regex(
 155                 (r'<div class="description">(.+?)</div>',
 156                  r'<div class="description-mobile">(.+?)</div>',
 157                  r'<div class="box-txt">([^<]+?)</div>',
 158                  r'<div class="field-content"><p>(.+?)</p></div>'),
 159                 webpage, 'description', default=None)
 160             or self._html_search_meta('description', webpage))
 161
 162         thumb = self._html_search_regex(
 163             (r'<div class="podcast-image"><img src="(.+?)"></div>',
 164              r'<div class="container-embed"[^<]+url\((.+?)\);">',
 165              r'<div class="field-content"><img src="(.+?)"'),
 166             webpage, 'thumbnail', fatal=False, default=None)
 167
 168         duration = parse_duration(self._html_search_regex(
 169             r'<span class="(?:durata|duration)">([\d:]+)</span>',
 170             webpage, 'duration', fatal=False, default=None))
 171
 172         date = self._html_search_regex(
 173             r'class="data">\s*(?:<span>)?([\d\.]+)\s*</',
 174             webpage, 'date', default=None)
 175
 176         date_alt = self._search_regex(
 177             r'(\d+[\./]\d+[\./]\d+)', title, 'date_alt', default=None)
 178         ppn = ppn or self._search_regex(
 179             r'ppN:\s*([\'"])(?P<ppn>.+?)\1',
 180             webpage, 'ppn', group='ppn', default=None)
 181         # if the date is not in the title
 182         # and title is the same as the show_title
 183         # add the date to the title
 184         if date and not date_alt and ppn and ppn.lower() == title.lower():
 185             title = f'{title} del {date}'
 186         return {
 187             'id': video_id,
 188             'title': title,
 189             'description': description,
 190             'duration': float_or_none(duration),
 191             'formats': formats,
 192             'thumbnail': thumb,
 193             'upload_date': unified_strdate(date),
 194         }
 195
 196     def _real_extract(self, url):
 197         video_id = self._match_id(url)
 198         webpage = self._download_webpage(url, video_id)
 199
 200         return self._extract_info(webpage, video_id)
 201
 202
 203 class LA7PodcastIE(LA7PodcastEpisodeIE):  # XXX: Do not subclass from concrete IE
 204     IE_NAME = 'la7.it:podcast'
 205     _VALID_URL = r'https?://(?:www\.)?la7\.it/(?P<id>[^/]+)/podcast/?(?:$|[#?])'
 206
 207     _TESTS = [{
 208         'url': 'https://www.la7.it/propagandalive/podcast',
 209         'info_dict': {
 210             'id': 'propagandalive',
 211             'title': 'Propaganda Live',
 212         },
 213         'playlist_mincount': 10,
 214     }]
 215
 216     def _real_extract(self, url):
 217         playlist_id = self._match_id(url)
 218         webpage = self._download_webpage(url, playlist_id)
 219
 220         title = (
 221             self._html_search_regex(
 222                 r'<h1.*?>(.+?)</h1>', webpage, 'title', fatal=False, default=None)
 223             or self._og_search_title(webpage))
 224         ppn = self._search_regex(
 225             r'window\.ppN\s*=\s*([\'"])(?P<ppn>.+?)\1',
 226             webpage, 'ppn', group='ppn', default=None)
 227
 228         entries = []
 229         for episode in re.finditer(
 230                 r'<div class="container-podcast-property">([\s\S]+?)(?:</div>\s*){3}',
 231                 webpage):
 232             entries.append(self._extract_info(episode.group(1), ppn=ppn))
 233
 234         return self.playlist_result(entries, playlist_id, title)