X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/9f517bb1f3f7462ae91624132f98125525d2cea9..5dbac313ae4e3e8521dfe2e1a6a048a98ff4b4fe:/yt_dlp/extractor/generic.py diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index baff0280f..3b8e1e957 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -1,149 +1,55 @@ -# coding: utf-8 - -from __future__ import unicode_literals - import os import re -import sys +import types +import urllib.parse +import xml.etree.ElementTree from .common import InfoExtractor +from .commonprotocols import RtmpIE from .youtube import YoutubeIE -from ..compat import ( - compat_etree_fromstring, - compat_str, - compat_urllib_parse_unquote, - compat_urlparse, - compat_xml_parse_error, -) +from ..compat import compat_etree_fromstring from ..utils import ( - determine_ext, + KNOWN_EXTENSIONS, + MEDIA_EXTENSIONS, ExtractorError, - float_or_none, - HEADRequest, + UnsupportedError, + determine_ext, + determine_protocol, + dict_get, + extract_basic_auth, + filter_dict, + format_field, int_or_none, is_html, js_to_json, - KNOWN_EXTENSIONS, merge_dicts, mimetype2ext, orderedSet, parse_duration, - sanitized_Request, + parse_resolution, smuggle_url, + str_or_none, + traverse_obj, + try_call, unescapeHTML, unified_timestamp, unsmuggle_url, - UnsupportedError, + update_url_query, url_or_none, + urlhandle_detect_ext, + urljoin, + variadic, xpath_attr, xpath_text, xpath_with_ns, ) -from .commonprotocols import RtmpIE -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .nbc import NBCSportsVPlayerIE -from .ooyala import OoyalaIE -from .rutv import RUTVIE -from .tvc import TVCIE -from .sportbox import SportBoxIE -from .myvi import MyviIE -from .condenast import CondeNastIE -from .udn import UDNEmbedIE -from .senategov import SenateISVPIE -from .svt import SVTIE -from .pornhub import PornHubIE -from .xhamster import XHamsterEmbedIE -from .tnaflix import TNAFlixNetworkEmbedIE -from .drtuber import DrTuberIE -from .redtube import RedTubeIE -from .tube8 import Tube8IE -from .mofosex import MofosexEmbedIE -from .spankwire import SpankwireIE -from .youporn import YouPornIE -from .vimeo import ( - VimeoIE, - VHXEmbedIE, -) -from .dailymotion import DailymotionIE -from .dailymail import DailyMailIE -from .onionstudios import OnionStudiosIE -from .viewlift import ViewLiftEmbedIE -from .mtv import MTVServicesEmbeddedIE -from .pladform import PladformIE -from .videomore import VideomoreIE -from .webcaster import WebcasterFeedIE -from .googledrive import GoogleDriveIE -from .jwplatform import JWPlatformIE -from .digiteka import DigitekaIE -from .arkena import ArkenaIE -from .instagram import InstagramIE -from .threeqsdn import ThreeQSDNIE -from .theplatform import ThePlatformIE -from .kaltura import KalturaIE -from .eagleplatform import EaglePlatformIE -from .facebook import FacebookIE -from .soundcloud import SoundcloudEmbedIE -from .tunein import TuneInBaseIE -from .vbox7 import Vbox7IE -from .dbtv import DBTVIE -from .piksel import PikselIE -from .videa import VideaIE -from .twentymin import TwentyMinutenIE -from .ustream import UstreamIE -from .arte import ArteTVEmbedIE -from .videopress import VideoPressIE -from .rutube import RutubeIE -from .limelight import LimelightBaseIE -from .anvato import AnvatoIE -from .washingtonpost import WashingtonPostIE -from .wistia import WistiaIE -from .mediaset import MediasetIE -from .joj import JojIE -from .megaphone import MegaphoneIE -from .vzaar import VzaarIE -from .channel9 import Channel9IE -from .vshare import VShareIE -from .mediasite import MediasiteIE -from .springboardplatform import SpringboardPlatformIE -from .yapfiles import YapFilesIE -from .vice import ViceIE -from .xfileshare import XFileShareIE -from .cloudflarestream import CloudflareStreamIE -from .peertube import PeerTubeIE -from .teachable import TeachableIE -from .indavideo import IndavideoEmbedIE -from .apa import APAIE -from .foxnews import FoxNewsIE -from .viqeo import ViqeoIE -from .expressen import ExpressenIE -from .zype import ZypeIE -from .odnoklassniki import OdnoklassnikiIE -from .vk import VKIE -from .kinja import KinjaEmbedIE -from .gedidigital import GediDigitalIE -from .rcs import RCSEmbedsIE -from .bitchute import BitChuteIE -from .rumble import RumbleEmbedIE -from .arcpublishing import ArcPublishingIE -from .medialaan import MedialaanIE -from .simplecast import SimplecastIE -from .wimtv import WimTVIE -from .tvp import TVPEmbedIE -from .blogger import BloggerIE -from .gfycat import GfycatIE class GenericIE(InfoExtractor): IE_DESC = 'Generic downloader that works on some sites' _VALID_URL = r'.*' IE_NAME = 'generic' + _NETRC_MACHINE = False # Suppress username warning _TESTS = [ # Direct link to a video { @@ -154,7 +60,9 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'trailer', 'upload_date': '20100513', - } + 'direct': True, + 'timestamp': 1273772943.0, + }, }, # Direct link to media delivered compressed (until Accept-Encoding is *) { @@ -167,7 +75,7 @@ class GenericIE(InfoExtractor): 'upload_date': '20140522', }, 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' + 'URL could be a direct video link, returning it as such.', ], 'skip': 'URL invalid', }, @@ -197,18 +105,20 @@ class GenericIE(InfoExtractor): 'ext': 'webm', 'title': '5_Lennart_Poettering_-_Systemd', 'upload_date': '20141120', + 'direct': True, + 'timestamp': 1416498816.0, }, 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' - ] + 'URL could be a direct video link, returning it as such.', + ], }, # RSS feed { 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', 'info_dict': { - 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'id': 'https://phihag.de/2014/youtube-dl/rss2.xml', 'title': 'Zero Punctuation', - 'description': 're:.*groundbreaking video review series.*' + 'description': 're:.*groundbreaking video review series.*', }, 'playlist_mincount': 11, }, @@ -229,6 +139,7 @@ class GenericIE(InfoExtractor): 'upload_date': '20201204', }, }], + 'skip': 'Dead link', }, # RSS feed with item with description and thumbnails { @@ -241,16 +152,19 @@ class GenericIE(InfoExtractor): 'playlist': [{ 'info_dict': { 'ext': 'm4a', - 'id': 'c1c879525ce2cb640b344507e682c36d', + 'id': '818a5d38-01cd-152f-2231-ee479677fa82', 'title': 're:Hydrogen!', 'description': 're:.*In this episode we are going.*', 'timestamp': 1567977776, 'upload_date': '20190908', - 'duration': 459, + 'duration': 423, 'thumbnail': r're:^https?://.*\.jpg$', 'episode_number': 1, 'season_number': 1, 'age_limit': 0, + 'season': 'Season 1', + 'direct': True, + 'episode': 'Episode 1', }, }], 'params': { @@ -267,6 +181,16 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 100, }, + # RSS feed with guid + { + 'url': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss', + 'info_dict': { + 'id': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss', + 'description': 'md5:be809a44b63b0c56fb485caf68685520', + 'title': 'The Little Red Podcast', + }, + 'playlist_mincount': 76, + }, # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng { 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', @@ -350,6 +274,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': '404 Not Found', }, # MPD from http://dash-mse-test.appspot.com/media.html { @@ -361,6 +286,7 @@ class GenericIE(InfoExtractor): 'title': 'car-20120827-manifest', 'formats': 'mincount:9', 'upload_date': '20130904', + 'timestamp': 1378272859.0, }, }, # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 @@ -401,14 +327,14 @@ class GenericIE(InfoExtractor): 'id': 'cmQHVoWB5FY', 'ext': 'mp4', 'upload_date': '20130224', - 'uploader_id': 'TheVerge', + 'uploader_id': '@TheVerge', 'description': r're:^Chris Ziegler takes a look at the\.*', 'uploader': 'The Verge', 'title': 'First Firefox OS phones side-by-side', }, 'params': { 'skip_download': False, - } + }, }, { # redirect in Refresh HTTP header @@ -434,7 +360,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'uploader': 'www.hodiho.fr', 'title': 'R\u00e9gis plante sa Jeep', - } + }, }, # bandcamp page with custom domain { @@ -448,228 +374,6 @@ class GenericIE(InfoExtractor): }, 'skip': 'There is a limit of 200 free downloads / month for the test song', }, - { - # embedded brightcove video - # it also tests brightcove videos that need to set the 'Referer' - # in the http requests - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', - 'info_dict': { - 'id': '2765128793001', - 'ext': 'mp4', - 'title': 'Le cours de bourse : l’analyse technique', - 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', - 'uploader': 'BFM BUSINESS', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # embedded with itemprop embedURL and video id spelled as `idVideo` - 'add_id': ['BrightcoveLegacy'], - 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', - 'info_dict': { - 'id': '5255628253001', - 'ext': 'mp4', - 'title': 'md5:37c519b1128915607601e75a87995fc0', - 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', - 'uploader': 'BFM BUSINESS', - 'uploader_id': '876450612001', - 'timestamp': 1482255315, - 'upload_date': '20161220', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # https://github.com/ytdl-org/youtube-dl/issues/2253 - 'url': 'http://bcove.me/i6nfkrc3', - 'md5': '0ba9446db037002366bab3b3eb30c88c', - 'info_dict': { - 'id': '3101154703001', - 'ext': 'mp4', - 'title': 'Still no power', - 'uploader': 'thestar.com', - 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', - }, - 'add_ie': ['BrightcoveLegacy'], - 'skip': 'video gone', - }, - { - 'url': 'http://www.championat.com/video/football/v/87/87499.html', - 'md5': 'fb973ecf6e4a78a67453647444222983', - 'info_dict': { - 'id': '3414141473001', - 'ext': 'mp4', - 'title': 'Видео. Удаление Дзагоева (ЦСКА)', - 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"', - 'uploader': 'Championat', - }, - }, - { - # https://github.com/ytdl-org/youtube-dl/issues/3541 - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', - 'info_dict': { - 'id': '3866516442001', - 'ext': 'mp4', - 'title': 'Leer mij vrouwen kennen: Aflevering 1', - 'description': 'Leer mij vrouwen kennen: Aflevering 1', - 'uploader': 'SBS Broadcasting', - }, - 'skip': 'Restricted to Netherlands', - 'params': { - 'skip_download': True, # m3u8 download - }, - }, - { - # Brightcove video in