10 import xml
.etree
.ElementTree
15 compat_urllib_parse_urlparse
,
26 _NO_DEFAULT
= object()
29 class InfoExtractor(object):
30 """Information Extractor class.
32 Information extractors are the classes that, given a URL, extract
33 information about the video (or videos) the URL refers to. This
34 information includes the real video URL, the video title, author and
35 others. The information is stored in a dictionary which is then
36 passed to the FileDownloader. The FileDownloader processes this
37 information possibly downloading the video to the file system, among
38 other possible outcomes.
40 The dictionaries must include the following fields:
43 title: Video title, unescaped.
45 Additionally, it must contain either a formats entry or a url one:
47 formats: A list of dictionaries for each format available, ordered
48 from worst to best quality.
51 * url Mandatory. The URL of the video file
52 * ext Will be calculated from url if missing
53 * format A human-readable description of the format
54 ("mp4 container with h264/opus").
55 Calculated from the format_id, width, height.
56 and format_note fields if missing.
57 * format_id A short description of the format
58 ("mp4_h264_opus" or "19").
59 Technically optional, but strongly recommended.
60 * format_note Additional info about the format
61 ("3D" or "DASH video")
62 * width Width of the video, if known
63 * height Height of the video, if known
64 * resolution Textual description of width and height
65 * tbr Average bitrate of audio and video in KBit/s
66 * abr Average audio bitrate in KBit/s
67 * acodec Name of the audio codec in use
68 * asr Audio sampling rate in Hertz
69 * vbr Average video bitrate in KBit/s
70 * vcodec Name of the video codec in use
71 * container Name of the container format
72 * filesize The number of bytes, if known in advance
73 * filesize_approx An estimate for the number of bytes
74 * player_url SWF Player URL (used for rtmpdump).
75 * protocol The protocol that will be used for the actual
77 "http", "https", "rtsp", "rtmp", "m3u8" or so.
78 * preference Order number of this format. If this field is
79 present and not None, the formats get sorted
80 by this field, regardless of all other values.
81 -1 for default (order by other properties),
82 -2 or smaller for less than default.
83 * quality Order number of the video quality of this
84 format, irrespective of the file format.
85 -1 for default (order by other properties),
86 -2 or smaller for less than default.
88 ext: Video filename extension.
89 format: The video format, defaults to ext (used for --get-format)
90 player_url: SWF Player URL (used for rtmpdump).
92 The following fields are optional:
94 display_id An alternative identifier for the video, not necessarily
95 unique, but available before title. Typically, id is
96 something like "4234987", title "Dancing naked mole rats",
97 and display_id "dancing-naked-mole-rats"
98 thumbnails: A list of dictionaries, with the following entries:
100 * "width" (optional, int)
101 * "height" (optional, int)
102 * "resolution" (optional, string "{width}x{height}",
104 thumbnail: Full URL to a video thumbnail image.
105 description: One-line video description.
106 uploader: Full name of the video uploader.
107 timestamp: UNIX timestamp of the moment the video became available.
108 upload_date: Video upload date (YYYYMMDD).
109 If not explicitly set, calculated from timestamp.
110 uploader_id: Nickname or id of the video uploader.
111 location: Physical location of the video.
112 subtitles: The subtitle file contents as a dictionary in the format
113 {language: subtitles}.
114 duration: Length of the video in seconds, as an integer.
115 view_count: How many users have watched the video on the platform.
116 like_count: Number of positive ratings of the video
117 dislike_count: Number of negative ratings of the video
118 comment_count: Number of comments on the video
119 age_limit: Age restriction for the video, as an integer (years)
120 webpage_url: The url to the video webpage, if given to youtube-dl it
121 should allow to get the same result again. (It will be set
122 by YoutubeDL if it's missing)
123 categories: A list of categories that the video falls in, for example
126 Unless mentioned otherwise, the fields should be Unicode strings.
128 Subclasses of this one should re-define the _real_initialize() and
129 _real_extract() methods and define a _VALID_URL regexp.
130 Probably, they should also be added to the list of extractors.
132 Finally, the _WORKING attribute should be set to False for broken IEs
133 in order to warn the users and skip the tests.
140 def __init__(self
, downloader
=None):
141 """Constructor. Receives an optional downloader."""
143 self
.set_downloader(downloader
)
def suitable(cls, url):
    """Tell whether this IE can handle *url*.

    The pattern in ``cls._VALID_URL`` is compiled on first use and stored
    on the class as ``_VALID_URL_RE``.  Returns True when the compiled
    pattern matches at the start of *url*, False otherwise.
    """
    # Look only at this class' own namespace on purpose: hasattr/getattr
    # would also find a regexp cached by a superclass, which was compiled
    # from the superclass' _VALID_URL rather than ours.
    if '_VALID_URL_RE' in vars(cls):
        compiled = cls._VALID_URL_RE
    else:
        compiled = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    matched = compiled.match(url)
    return matched is not None
158 """Getter method for _WORKING."""
161 def initialize(self
):
162 """Initializes an instance (authentication, etc)."""
164 self
._real
_initialize
()
167 def extract(self
, url
):
168 """Extracts URL information and returns it in list of dicts."""
170 return self
._real
_extract
(url
)
def set_downloader(self, downloader):
    """Attach *downloader* to this IE.

    The object is stored as ``self._downloader`` without any validation.
    """
    self._downloader = downloader
176 def _real_initialize(self
):
177 """Real initialization process. Redefine in subclasses."""
180 def _real_extract(self
, url
):
181 """Real extraction process. Redefine in subclasses."""
186 """A string for getting the InfoExtractor with get_info_extractor"""
187 return cls
.__name
__[:-2]
191 return type(self
).__name
__[:-2]
193 def _request_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True):
194 """ Returns the response handle """
196 self
.report_download_webpage(video_id
)
197 elif note
is not False:
199 self
.to_screen(u
'%s' % (note
,))
201 self
.to_screen(u
'%s: %s' % (video_id
, note
))
203 return self
._downloader
.urlopen(url_or_request
)
204 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
208 errnote
= u
'Unable to download webpage'
209 errmsg
= u
'%s: %s' % (errnote
, compat_str(err
))
211 raise ExtractorError(errmsg
, sys
.exc_info()[2], cause
=err
)
213 self
._downloader
.report_warning(errmsg
)
216 def _download_webpage_handle(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True):
217 """ Returns a tuple (page content as string, URL handle) """
219 # Strip hashes from the URL (#1038)
220 if isinstance(url_or_request
, (compat_str
, str)):
221 url_or_request
= url_or_request
.partition('#')[0]
223 urlh
= self
._request
_webpage
(url_or_request
, video_id
, note
, errnote
, fatal
)
227 content_type
= urlh
.headers
.get('Content-Type', '')
228 webpage_bytes
= urlh
.read()
229 m
= re
.match(r
'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type
)
231 encoding
= m
.group(1)
233 m
= re
.search(br
'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
234 webpage_bytes[:1024])
236 encoding = m.group(1).decode('ascii')
237 elif webpage_bytes.startswith(b'\xff\xfe'):
241 if self._downloader.params.get('dump_intermediate_pages', False):
243 url = url_or_request.get_full_url()
244 except AttributeError:
246 self.to_screen(u'Dumping request to ' + url)
247 dump = base64.b64encode(webpage_bytes).decode('ascii')
248 self._downloader.to_screen(dump)
249 if self._downloader.params.get('write_pages', False):
251 url = url_or_request.get_full_url()
252 except AttributeError:
254 basen = '%s_%s' % (video_id, url)
256 h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
257 basen = basen[:240 - len(h)] + h
258 raw_filename = basen + '.dump'
259 filename = sanitize_filename(raw_filename, restricted=True)
260 self.to_screen(u'Saving request to ' + filename)
261 with open(filename, 'wb') as outf:
262 outf.write(webpage_bytes)
265 content = webpage_bytes.decode(encoding, 'replace')
267 content = webpage_bytes.decode('utf-8', 'replace')
269 if (u'<title>Access to this site is blocked</title>' in content and
270 u'Websense' in content[:512]):
271 msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
272 blocked_iframe = self._html_search_regex(
273 r'<iframe src="([^
"]+)"', content,
274 u'Websense information URL
', default=None)
276 msg += u' Visit
%s for more details
' % blocked_iframe
277 raise ExtractorError(msg, expected=True)
279 return (content, urlh)
281 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
282 """ Returns the data of the page as a string """
283 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
290 def _download_xml(self, url_or_request, video_id,
291 note=u'Downloading XML
', errnote=u'Unable to download XML
',
292 transform_source=None, fatal=True):
293 """Return the xml as an xml.etree.ElementTree.Element"""
294 xml_string = self._download_webpage(
295 url_or_request, video_id, note, errnote, fatal=fatal)
296 if xml_string is False:
299 xml_string = transform_source(xml_string)
300 return xml.etree.ElementTree.fromstring(xml_string.encode('utf
-8'))
302 def _download_json(self, url_or_request, video_id,
303 note=u'Downloading JSON metadata
',
304 errnote=u'Unable to download JSON metadata
',
305 transform_source=None,
307 json_string = self._download_webpage(
308 url_or_request, video_id, note, errnote, fatal=fatal)
309 if (not fatal) and json_string is False:
312 json_string = transform_source(json_string)
314 return json.loads(json_string)
315 except ValueError as ve:
316 raise ExtractorError('Failed to download JSON
', cause=ve)
def report_warning(self, msg, video_id=None):
    """Emit a warning through the downloader, tagged with this IE's name.

    The text passed on is ``[<IE_NAME>] <msg>``; when *video_id* is not
    None, ``<video_id>: `` is inserted in front of the message.
    """
    if video_id is None:
        id_prefix = u''
    else:
        id_prefix = u'%s: ' % video_id
    text = u'[%s] %s%s' % (self.IE_NAME, id_prefix, msg)
    self._downloader.report_warning(text)
def to_screen(self, msg):
    """Send *msg* to the downloader's screen output as '[ie_name] msg'."""
    tagged = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(tagged)
def report_extraction(self, id_or_name):
    """Announce on screen that information extraction is under way."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce on screen that the webpage for *video_id* is being fetched."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Announce on screen that age confirmation is being attempted."""
    message = u'Confirming age'
    self.to_screen(message)
def report_login(self):
    """Announce on screen that a login is being attempted."""
    message = u'Logging in'
    self.to_screen(message)
343 #Methods for following #608
345 def url_result(url, ie=None, video_id=None):
346 """Returns a url that points to a page that should be processed"""
347 #TODO: ie should be the class used for getting the info
348 video_info = {'_type
': 'url
',
351 if video_id is not None:
352 video_info['id'] = video_id
355 def playlist_result(entries, playlist_id=None, playlist_title=None):
356 """Returns a playlist"""
357 video_info = {'_type
': 'playlist
',
360 video_info['id'] = playlist_id
362 video_info['title
'] = playlist_title
365 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
367 Perform a regex search on the given string, using a single or a list of
368 patterns returning the first matching group.
369 In case of failure return a default value or raise a WARNING or a
370 RegexNotFoundError, depending on fatal, specifying the field name.
372 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
373 mobj = re.search(pattern, string, flags)
376 mobj = re.search(p, string, flags)
380 if os.name != 'nt
' and sys.stderr.isatty():
381 _name = u'\033[0;34m
%s\033[0m
' % name
386 # return the first matching group
387 return next(g for g in mobj.groups() if g is not None)
388 elif default is not _NO_DEFAULT:
391 raise RegexNotFoundError(u'Unable to extract
%s' % _name)
393 self._downloader.report_warning(u'unable to extract
%s; '
394 u'please report this issue on http
://yt
-dl
.org
/bug
' % _name)
397 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
399 Like _search_regex, but strips HTML tags and unescapes entities.
401 res = self._search_regex(pattern, string, name, default, fatal, flags)
403 return clean_html(res).strip()
407 def _get_login_info(self):
409 Get the login info as (username, password)
410 It will look in the netrc file using the _NETRC_MACHINE value
411 If there's no info available
, return (None, None)
413 if self._downloader is None:
418 downloader_params = self._downloader.params
420 # Attempt to use provided username and password or .netrc data
421 if downloader_params.get('username', None) is not None:
422 username = downloader_params['username']
423 password = downloader_params['password']
424 elif downloader_params.get('usenetrc', False):
426 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
431 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
432 except (IOError, netrc.NetrcParseError) as err:
433 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
435 return (username, password)
437 def _get_tfa_info(self):
439 Get the two
-factor authentication info
440 TODO
- asking the user will be required
for sms
/phone verify
441 currently just uses the command line option
442 If there
's no info available, return None
444 if self._downloader is None:
446 downloader_params = self._downloader.params
448 if downloader_params.get('twofactor
', None) is not None:
449 return downloader_params['twofactor
']
453 # Helper functions for extracting OpenGraph info
455 def _og_regexes(prop):
456 content_re = r'content
=(?
:"([^>]+?)"|
\'([^
>]+?
)\')'
457 property_re = r'(?
:name|
property)=[\'"]og:%s[\'"]' % re.escape(prop)
458 template = r'<meta
[^
>]+?
%s[^
>]+?
%s'
460 template % (property_re, content_re),
461 template % (content_re, property_re),
464 def _og_search_property(self, prop, html, name=None, **kargs):
466 name = 'OpenGraph
%s' % prop
467 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
470 return unescapeHTML(escaped)
472 def _og_search_thumbnail(self, html, **kargs):
473 return self._og_search_property('image
', html, u'thumbnail url
', fatal=False, **kargs)
475 def _og_search_description(self, html, **kargs):
476 return self._og_search_property('description
', html, fatal=False, **kargs)
478 def _og_search_title(self, html, **kargs):
479 return self._og_search_property('title
', html, **kargs)
481 def _og_search_video_url(self, html, name='video url
', secure=True, **kargs):
482 regexes = self._og_regexes('video
')
483 if secure: regexes = self._og_regexes('video
:secure_url
') + regexes
484 return self._html_search_regex(regexes, html, name, **kargs)
486 def _og_search_url(self, html, **kargs):
487 return self._og_search_property('url
', html, **kargs)
489 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
490 if display_name is None:
492 return self._html_search_regex(
494 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
495 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
496 html, display_name, fatal=fatal, **kwargs)
498 def _dc_search_uploader(self, html):
499 return self._html_search_meta('dc
.creator
', html, 'uploader
')
501 def _rta_search(self, html):
502 # See http://www.rtalabel.org/index.php?content=howtofaq#single
503 if re.search(r'(?ix
)<meta\s
+name
="rating"\s
+'
504 r' content
="RTA-5042-1996-1400-1577-RTA"',
509 def _media_rating_search(self, html):
510 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
511 rating = self._html_search_meta('rating
', html)
523 return RATING_TABLE.get(rating.lower(), None)
525 def _twitter_search_player(self, html):
526 return self._html_search_meta('twitter
:player
', html,
527 'twitter card player
')
529 def _sort_formats(self, formats):
531 raise ExtractorError(u'No video formats found
')
534 # TODO remove the following workaround
535 from ..utils import determine_ext
536 if not f.get('ext
') and 'url
' in f:
537 f['ext
'] = determine_ext(f['url
'])
539 preference = f.get('preference
')
540 if preference is None:
541 proto = f.get('protocol
')
543 proto = compat_urllib_parse_urlparse(f.get('url
', '')).scheme
545 preference = 0 if proto in ['http
', 'https
'] else -0.1
546 if f.get('ext
') in ['f4f
', 'f4m
']: # Not yet supported
549 if f.get('vcodec
') == 'none
': # audio only
550 if self._downloader.params.get('prefer_free_formats
'):
551 ORDER = [u'aac
', u'mp3
', u'm4a
', u'webm
', u'ogg
', u'opus
']
553 ORDER = [u'webm
', u'opus
', u'ogg
', u'mp3
', u'aac
', u'm4a
']
556 audio_ext_preference = ORDER.index(f['ext
'])
558 audio_ext_preference = -1
560 if self._downloader.params.get('prefer_free_formats
'):
561 ORDER = [u'flv
', u'mp4
', u'webm
']
563 ORDER = [u'webm
', u'flv
', u'mp4
']
565 ext_preference = ORDER.index(f['ext
'])
568 audio_ext_preference = 0
572 f.get('quality
') if f.get('quality
') is not None else -1,
573 f.get('height
') if f.get('height
') is not None else -1,
574 f.get('width
') if f.get('width
') is not None else -1,
576 f.get('tbr
') if f.get('tbr
') is not None else -1,
577 f.get('vbr
') if f.get('vbr
') is not None else -1,
578 f.get('abr
') if f.get('abr
') is not None else -1,
579 audio_ext_preference,
580 f.get('filesize
') if f.get('filesize
') is not None else -1,
581 f.get('filesize_approx
') if f.get('filesize_approx
') is not None else -1,
584 formats.sort(key=_formats_key)
586 def http_scheme(self):
587 """ Either "http:" or "https:", depending on the user's preferences
"""
590 if self._downloader.params.get('prefer_insecure', False)
593 def _proto_relative_url(self, url, scheme=None):
596 if url.startswith('//'):
598 scheme = self.http_scheme()
603 def _sleep(self, timeout, video_id, msg_template=None):
604 if msg_template is None:
605 msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
606 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
610 def _extract_f4m_formats(self, manifest_url, video_id):
611 manifest = self._download_xml(
612 manifest_url, video_id, 'Downloading f4m manifest',
613 'Unable to download f4m manifest')
616 for media_el in manifest.findall('{http://ns.adobe.com/f4m/1.0}media'):
620 'tbr': int_or_none(media_el.attrib.get('bitrate')),
621 'width': int_or_none(media_el.attrib.get('width')),
622 'height': int_or_none(media_el.attrib.get('height')),
624 self._sort_formats(formats)
629 class SearchInfoExtractor(InfoExtractor):
631 Base
class for paged search queries extractors
.
632 They accept urls
in the format
_SEARCH_KEY(|all|
[0-9]):{query}
633 Instances should define _SEARCH_KEY
and _MAX_RESULTS
.
637 def _make_valid_url(cls):
638 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
def suitable(cls, url):
    """Tell whether *url* is a search query this extractor accepts.

    True when the pattern from ``cls._make_valid_url()`` matches at the
    start of *url*, False otherwise.
    """
    query_pattern = cls._make_valid_url()
    matched = re.match(query_pattern, url)
    return matched is not None
644 def _real_extract(self, query):
645 mobj = re.match(self._make_valid_url(), query)
647 raise ExtractorError(u'Invalid search query "%s"' % query)
649 prefix = mobj.group('prefix')
650 query = mobj.group('query')
652 return self._get_n_results(query, 1)
653 elif prefix == 'all':
654 return self._get_n_results(query, self._MAX_RESULTS)
658 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
659 elif n > self._MAX_RESULTS:
660 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
661 n = self._MAX_RESULTS
662 return self._get_n_results(query, n)
664 def _get_n_results(self, query, n):
665 """Get a specified number of results
for a query
"""
666 raise NotImplementedError("This method must be implemented by subclasses")
def SEARCH_KEY(self):
    """Return the value of the ``_SEARCH_KEY`` attribute."""
    key = self._SEARCH_KEY
    return key