X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/f14a2d838240e9e75fe52d4e381156064e90674c..5dbac313ae4e3e8521dfe2e1a6a048a98ff4b4fe:/yt_dlp/extractor/generic.py diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index ec1cbf005..3b8e1e957 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -1,121 +1,23 @@ import os import re +import types import urllib.parse import xml.etree.ElementTree -from . import gen_extractor_classes -from .common import InfoExtractor # isort: split -from .ant1newsgr import Ant1NewsGrEmbedIE -from .anvato import AnvatoIE -from .apa import APAIE -from .arcpublishing import ArcPublishingIE -from .arkena import ArkenaIE -from .arte import ArteTVEmbedIE -from .bitchute import BitChuteIE -from .blogger import BloggerIE -from .brightcove import BrightcoveLegacyIE, BrightcoveNewIE -from .channel9 import Channel9IE -from .cloudflarestream import CloudflareStreamIE +from .common import InfoExtractor from .commonprotocols import RtmpIE -from .condenast import CondeNastIE -from .dailymail import DailyMailIE -from .dailymotion import DailymotionIE -from .dbtv import DBTVIE -from .digiteka import DigitekaIE -from .drtuber import DrTuberIE -from .eagleplatform import EaglePlatformIE -from .ertgr import ERTWebtvEmbedIE -from .expressen import ExpressenIE -from .facebook import FacebookIE -from .foxnews import FoxNewsIE -from .gedidigital import GediDigitalIE -from .gfycat import GfycatIE -from .glomex import GlomexEmbedIE -from .googledrive import GoogleDriveIE -from .indavideo import IndavideoEmbedIE -from .instagram import InstagramIE -from .joj import JojIE -from .jwplatform import JWPlatformIE -from .kaltura import KalturaIE -from .kinja import KinjaEmbedIE -from .limelight import LimelightBaseIE -from .mainstreaming import MainStreamingIE -from .medialaan import MedialaanIE -from .mediaset import MediasetIE -from .mediasite import MediasiteIE -from .megaphone import MegaphoneIE -from .megatvcom import MegaTVComEmbedIE -from .mofosex import MofosexEmbedIE -from .mtv import MTVServicesEmbeddedIE -from .myvi import MyviIE -from .nbc import NBCSportsVPlayerIE -from .nexx import NexxEmbedIE, NexxIE -from .odnoklassniki import OdnoklassnikiIE -from .onionstudios import OnionStudiosIE -from .ooyala import OoyalaIE -from .panopto import PanoptoBaseIE -from .peertube import PeerTubeIE -from .piksel import PikselIE -from .pladform import PladformIE -from .pornhub import PornHubIE -from .rcs import RCSEmbedsIE -from .redtube import RedTubeIE -from .rumble import RumbleEmbedIE -from .rutube import RutubeIE -from .rutv import RUTVIE -from .ruutu import RuutuIE -from .senategov import SenateISVPIE -from .simplecast import SimplecastIE -from .soundcloud import SoundcloudEmbedIE -from .spankwire import SpankwireIE -from .sportbox import SportBoxIE -from .spotify import SpotifyBaseIE -from .springboardplatform import SpringboardPlatformIE -from .substack import SubstackIE -from .svt import SVTIE -from .teachable import TeachableIE -from .ted import TedEmbedIE -from .theplatform import ThePlatformIE -from .threeqsdn import ThreeQSDNIE -from .tiktok import TikTokIE -from .tnaflix import TNAFlixNetworkEmbedIE -from .tube8 import Tube8IE -from .tunein import TuneInBaseIE -from .tvc import TVCIE -from .tvopengr import TVOpenGrEmbedIE -from .tvp import TVPEmbedIE -from .twentymin import TwentyMinutenIE -from .udn import UDNEmbedIE -from .ustream import UstreamIE -from .vbox7 import Vbox7IE -from .vice import ViceIE -from .videa import VideaIE -from .videomore import VideomoreIE -from .videopress import VideoPressIE -from .viewlift import ViewLiftEmbedIE -from .vimeo import VHXEmbedIE, VimeoIE -from .viqeo import ViqeoIE -from .vk import VKIE -from .vshare import VShareIE -from .vzaar import VzaarIE -from .washingtonpost import WashingtonPostIE -from .webcaster import WebcasterFeedIE -from .wimtv import WimTVIE -from .wistia import WistiaIE -from .xfileshare import XFileShareIE -from .xhamster import XHamsterEmbedIE -from .yapfiles import YapFilesIE -from .youporn import YouPornIE from .youtube import YoutubeIE -from .zype import ZypeIE from ..compat import compat_etree_fromstring from ..utils import ( KNOWN_EXTENSIONS, + MEDIA_EXTENSIONS, ExtractorError, UnsupportedError, determine_ext, + determine_protocol, dict_get, - float_or_none, + extract_basic_auth, + filter_dict, format_field, int_or_none, is_html, @@ -127,11 +29,16 @@ parse_resolution, smuggle_url, str_or_none, + traverse_obj, try_call, unescapeHTML, unified_timestamp, unsmuggle_url, + update_url_query, url_or_none, + urlhandle_detect_ext, + urljoin, + variadic, xpath_attr, xpath_text, xpath_with_ns, @@ -153,7 +60,9 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'trailer', 'upload_date': '20100513', - } + 'direct': True, + 'timestamp': 1273772943.0, + }, }, # Direct link to media delivered compressed (until Accept-Encoding is *) { @@ -166,7 +75,7 @@ class GenericIE(InfoExtractor): 'upload_date': '20140522', }, 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' + 'URL could be a direct video link, returning it as such.', ], 'skip': 'URL invalid', }, @@ -196,10 +105,12 @@ class GenericIE(InfoExtractor): 'ext': 'webm', 'title': '5_Lennart_Poettering_-_Systemd', 'upload_date': '20141120', + 'direct': True, + 'timestamp': 1416498816.0, }, 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' - ] + 'URL could be a direct video link, returning it as such.', + ], }, # RSS feed { @@ -207,7 +118,7 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': 'https://phihag.de/2014/youtube-dl/rss2.xml', 'title': 'Zero Punctuation', - 'description': 're:.*groundbreaking video review series.*' + 'description': 're:.*groundbreaking video review series.*', }, 'playlist_mincount': 11, }, @@ -228,6 +139,7 @@ class GenericIE(InfoExtractor): 'upload_date': '20201204', }, }], + 'skip': 'Dead link', }, # RSS feed with item with description and thumbnails { @@ -240,12 +152,12 @@ class GenericIE(InfoExtractor): 'playlist': [{ 'info_dict': { 'ext': 'm4a', - 'id': 'c1c879525ce2cb640b344507e682c36d', + 'id': '818a5d38-01cd-152f-2231-ee479677fa82', 'title': 're:Hydrogen!', 'description': 're:.*In this episode we are going.*', 'timestamp': 1567977776, 'upload_date': '20190908', - 'duration': 459, + 'duration': 423, 'thumbnail': r're:^https?://.*\.jpg$', 'episode_number': 1, 'season_number': 1, @@ -362,6 +274,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': '404 Not Found', }, # MPD from http://dash-mse-test.appspot.com/media.html { @@ -373,6 +286,7 @@ class GenericIE(InfoExtractor): 'title': 'car-20120827-manifest', 'formats': 'mincount:9', 'upload_date': '20130904', + 'timestamp': 1378272859.0, }, }, # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 @@ -413,14 +327,14 @@ class GenericIE(InfoExtractor): 'id': 'cmQHVoWB5FY', 'ext': 'mp4', 'upload_date': '20130224', - 'uploader_id': 'TheVerge', + 'uploader_id': '@TheVerge', 'description': r're:^Chris Ziegler takes a look at the\.*', 'uploader': 'The Verge', 'title': 'First Firefox OS phones side-by-side', }, 'params': { 'skip_download': False, - } + }, }, { # redirect in Refresh HTTP header @@ -446,7 +360,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'uploader': 'www.hodiho.fr', 'title': 'R\u00e9gis plante sa Jeep', - } + }, }, # bandcamp page with custom domain { @@ -460,228 +374,6 @@ class GenericIE(InfoExtractor): }, 'skip': 'There is a limit of 200 free downloads / month for the test song', }, - { - # embedded brightcove video - # it also tests brightcove videos that need to set the 'Referer' - # in the http requests - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', - 'info_dict': { - 'id': '2765128793001', - 'ext': 'mp4', - 'title': 'Le cours de bourse : l’analyse technique', - 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', - 'uploader': 'BFM BUSINESS', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # embedded with itemprop embedURL and video id spelled as `idVideo` - 'add_id': ['BrightcoveLegacy'], - 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', - 'info_dict': { - 'id': '5255628253001', - 'ext': 'mp4', - 'title': 'md5:37c519b1128915607601e75a87995fc0', - 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', - 'uploader': 'BFM BUSINESS', - 'uploader_id': '876450612001', - 'timestamp': 1482255315, - 'upload_date': '20161220', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # https://github.com/ytdl-org/youtube-dl/issues/2253 - 'url': 'http://bcove.me/i6nfkrc3', - 'md5': '0ba9446db037002366bab3b3eb30c88c', - 'info_dict': { - 'id': '3101154703001', - 'ext': 'mp4', - 'title': 'Still no power', - 'uploader': 'thestar.com', - 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', - }, - 'add_ie': ['BrightcoveLegacy'], - 'skip': 'video gone', - }, - { - 'url': 'http://www.championat.com/video/football/v/87/87499.html', - 'md5': 'fb973ecf6e4a78a67453647444222983', - 'info_dict': { - 'id': '3414141473001', - 'ext': 'mp4', - 'title': 'Видео. Удаление Дзагоева (ЦСКА)', - 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"', - 'uploader': 'Championat', - }, - }, - { - # https://github.com/ytdl-org/youtube-dl/issues/3541 - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', - 'info_dict': { - 'id': '3866516442001', - 'ext': 'mp4', - 'title': 'Leer mij vrouwen kennen: Aflevering 1', - 'description': 'Leer mij vrouwen kennen: Aflevering 1', - 'uploader': 'SBS Broadcasting', - }, - 'skip': 'Restricted to Netherlands', - 'params': { - 'skip_download': True, # m3u8 download - }, - }, - { - # Brightcove video in