X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/241464919271278831f23b3a086dcf57aeb80d3b..21a73e9f39386d4ddfe07a9049cc33c13afed493:/yt_dlp/extractor/generic.py diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index f594d02c2..130beb871 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -74,6 +74,7 @@ from .ted import TedEmbedIE from .theplatform import ThePlatformIE from .threeqsdn import ThreeQSDNIE +from .tiktok import TikTokIE from .tnaflix import TNAFlixNetworkEmbedIE from .tube8 import Tube8IE from .tunein import TuneInBaseIE @@ -129,6 +130,7 @@ sanitized_Request, smuggle_url, str_or_none, + try_call, unescapeHTML, unified_timestamp, unsmuggle_url, @@ -2526,6 +2528,41 @@ class GenericIE(InfoExtractor): 'upload_date': '20220504', }, }, + { + # Webpage contains double BOM + 'url': 'https://www.filmarkivet.se/movies/paris-d-moll/', + 'md5': 'df02cadc719dcc63d43288366f037754', + 'info_dict': { + 'id': 'paris-d-moll', + 'ext': 'mp4', + 'upload_date': '20220518', + 'title': 'Paris d-moll', + 'description': 'md5:319e37ea5542293db37e1e13072fe330', + 'thumbnail': 'https://www.filmarkivet.se/wp-content/uploads/parisdmoll2.jpg', + 'timestamp': 1652833414, + 'age_limit': 0, + } + }, { + 'url': 'https://www.skimag.com/video/ski-people-1980/', + 'info_dict': { + 'id': 'ski-people-1980', + 'title': 'Ski People (1980)', + }, + 'playlist_count': 1, + 'playlist': [{ + 'md5': '022a7e31c70620ebec18deeab376ee03', + 'info_dict': { + 'id': 'YTmgRiNU', + 'ext': 'mp4', + 'title': '1980 Ski People', + 'timestamp': 1610407738, + 'description': 'md5:cf9c3d101452c91e141f292b19fe4843', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/YTmgRiNU/poster.jpg?width=720', + 'duration': 5688.0, + 'upload_date': '20210111', + } + }] + }, ] def report_following_redirect(self, new_url): @@ -2536,66 +2573,44 @@ def report_detected(self, name): self._downloader.write_debug(f'Identified a {name}') def _extract_rss(self, url, video_id, doc): - playlist_title = doc.find('./channel/title').text - playlist_desc_el = doc.find('./channel/description') - playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text - NS_MAP = { 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', } entries = [] for it in doc.findall('./channel/item'): - next_url = None - enclosure_nodes = it.findall('./enclosure') - for e in enclosure_nodes: - next_url = e.attrib.get('url') - if next_url: - break - - if not next_url: - next_url = xpath_text(it, 'link', fatal=False) - + next_url = next( + (e.attrib.get('url') for e in it.findall('./enclosure')), + xpath_text(it, 'link', fatal=False)) if not next_url: continue - if it.find('guid').text is not None: - next_url = smuggle_url(next_url, {'force_videoid': it.find('guid').text}) + guid = try_call(lambda: it.find('guid').text) + if guid: + next_url = smuggle_url(next_url, {'force_videoid': guid}) def itunes(key): - return xpath_text( - it, xpath_with_ns('./itunes:%s' % key, NS_MAP), - default=None) - - duration = itunes('duration') - explicit = (itunes('explicit') or '').lower() - if explicit in ('true', 'yes'): - age_limit = 18 - elif explicit in ('false', 'no'): - age_limit = 0 - else: - age_limit = None + return xpath_text(it, xpath_with_ns(f'./itunes:{key}', NS_MAP), default=None) entries.append({ '_type': 'url_transparent', 'url': next_url, - 'title': it.find('title').text, + 'title': try_call(lambda: it.find('title').text), 'description': xpath_text(it, 'description', default=None), - 'timestamp': unified_timestamp( - xpath_text(it, 'pubDate', default=None)), - 'duration': int_or_none(duration) or parse_duration(duration), + 'timestamp': unified_timestamp(xpath_text(it, 'pubDate', default=None)), + 'duration': parse_duration(itunes('duration')), 'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')), 'episode': itunes('title'), 'episode_number': int_or_none(itunes('episode')), 'season_number': int_or_none(itunes('season')), - 'age_limit': age_limit, + 'age_limit': {'true': 18, 'yes': 18, 'false': 0, 'no': 0}.get((itunes('explicit') or '').lower()), }) return { '_type': 'playlist', 'id': url, - 'title': playlist_title, - 'description': playlist_desc, + 'title': try_call(lambda: doc.find('./channel/title').text), + 'description': try_call(lambda: doc.find('./channel/description').text), 'entries': entries, } @@ -2975,7 +2990,7 @@ def _real_extract(self, url): if vimeo_urls: return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) - vhx_url = VHXEmbedIE._extract_url(webpage) + vhx_url = VHXEmbedIE._extract_url(url, webpage) if vhx_url: return self.url_result(vhx_url, VHXEmbedIE.ie_key()) @@ -3762,6 +3777,11 @@ def _real_extract(self, url): if ruutu_urls: return self.playlist_from_matches(ruutu_urls, video_id, video_title) + # Look for Tiktok embeds + tiktok_urls = TikTokIE._extract_urls(webpage) + if tiktok_urls: + return self.playlist_from_matches(tiktok_urls, video_id, video_title) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: @@ -3871,15 +3891,10 @@ def _real_extract(self, url): json_ld = self._search_json_ld(webpage, video_id, default={}) if json_ld.get('url') not in (url, None): self.report_detected('JSON LD') - if determine_ext(json_ld['url']) == 'm3u8': - json_ld['formats'], json_ld['subtitles'] = self._extract_m3u8_formats_and_subtitles( - json_ld['url'], video_id, 'mp4') - json_ld.pop('url') - self._sort_formats(json_ld['formats']) - else: - json_ld['_type'] = 'url_transparent' - json_ld['url'] = smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True}) - return merge_dicts(json_ld, info_dict) + return merge_dicts({ + '_type': 'url_transparent', + 'url': smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True}), + }, json_ld, info_dict) def check_video(vurl): if YoutubeIE.suitable(vurl):