]> jfr.im git - yt-dlp.git/blob - youtube_dl/extractor/common.py
Add an extractor for cmt.com (closes #2049)
[yt-dlp.git] / youtube_dl / extractor / common.py
1 import base64
2 import os
3 import re
4 import socket
5 import sys
6 import netrc
7 import xml.etree.ElementTree
8
9 from ..utils import (
10 compat_http_client,
11 compat_urllib_error,
12 compat_urllib_parse_urlparse,
13 compat_str,
14
15 clean_html,
16 compiled_regex_type,
17 ExtractorError,
18 RegexNotFoundError,
19 sanitize_filename,
20 unescapeHTML,
21 )
# Sentinel meaning "the caller passed no default"; it lets None be used as a
# legitimate default value in _search_regex and friends.
_NO_DEFAULT = object()


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present, the formats get sorted by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc).

        _real_initialize() runs at most once per instance; later calls are
        no-ops because _ready is set afterwards.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Class name minus its last two characters (the "IE" suffix by convention).
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        """Display name: the class name minus its last two characters."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle

        note:    progress message. None prints the default "Downloading
                 webpage" line, False prints nothing.
        errnote: prefix of the error message. None selects a default text;
                 False makes a failed request return False silently,
                 regardless of fatal.
        fatal:   on failure raise ExtractorError if true, otherwise report
                 a warning and return False.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle)

        Returns False instead when the request failed non-fatally (see
        _request_webpage). The text encoding is taken from the Content-Type
        header, else from a <meta ... charset=...> tag within the first
        1024 bytes, else UTF-8 is assumed. Honours the downloader params
        'dump_intermediate_pages' and 'write_pages'.
        """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            # Internal invariant: a False handle is only expected from
            # non-fatal requests.
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'

        params = self._downloader.params
        dump_pages = params.get('dump_intermediate_pages', False)
        write_pages = params.get('write_pages', False)
        if dump_pages or write_pages:
            # Request objects expose get_full_url(); plain strings do not.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
        if dump_pages:
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if write_pages:
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The advertised charset is not a codec Python knows about;
            # degrade to UTF-8 rather than aborting the extraction.
            content = webpage_bytes.decode('utf-8', 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string

        Returns False when _download_webpage_handle does.
        """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None):
        """Return the xml as an xml.etree.ElementTree.Element

        transform_source, if given, is applied to the downloaded text
        before it is parsed.
        """
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def report_warning(self, msg, video_id=None):
        """Report a warning prefixed with '[ie_name]' and, if given, the video id."""
        idstr = u'' if video_id is None else u'%s: ' % video_id
        self._downloader.report_warning(
            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        # Start from "no match" so that an empty list of patterns is handled
        # like any other failed search instead of leaving mobj unbound.
        mobj = None
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colour the field name (ANSI blue) only on non-Windows terminals.
        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            # A username may be supplied without a password entry; report
            # the password as None instead of raising KeyError.
            password = downloader_params.get('password')
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        """Return two regexes for <meta property="og:PROP" content=...>,
        one for each order of the property and content attributes."""
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        """Return the unescaped content of the og:PROP meta tag found in html.

        name defaults to 'OpenGraph PROP'; extra keyword arguments are
        forwarded to _search_regex.
        """
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        """Non-fatal lookup of og:image."""
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        """Non-fatal lookup of og:description."""
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        """Lookup of og:title."""
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        """Lookup of og:video; with secure, og:video:secure_url is tried first."""
        regexes = self._og_regexes('video')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        """Non-fatal lookup of the content attribute of the <meta> tag whose
        itemprop, name or property attribute equals name."""
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        """Non-fatal lookup of the dc.creator meta tag, reported as 'uploader'."""
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        """Return 18 if html carries the RTA rating meta tag, else 0."""
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        """Map the 'rating' meta tag to a number via RATING_TABLE.

        Returns None when the tag is absent or its value is not in the table.
        """
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)

    def _sort_formats(self, formats):
        """Sort formats in place, ascending by the key built in _formats_key.

        As a side effect, a format that lacks 'ext' but has a 'url' gets its
        'ext' filled in from the URL.
        """
        # TODO remove the following workaround
        from ..utils import determine_ext

        # Position in these lists is the extension's rank; unknown
        # extensions rank -1.
        if self._downloader.params.get('prefer_free_formats'):
            audio_order = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
            video_order = [u'flv', u'mp4', u'webm']
        else:
            audio_order = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
            video_order = [u'webm', u'flv', u'mp4']

        def _rank(order, ext):
            # A missing ext (None) is treated like an unknown one.
            try:
                return order.index(ext)
            except ValueError:
                return -1

        def _or_minus_one(value):
            return value if value is not None else -1

        def _formats_key(f):
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])
            # .get(): a format with neither 'ext' nor 'url' must not raise.
            ext = f.get('ext')

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if ext in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                ext_preference = 0
                audio_ext_preference = _rank(audio_order, ext)
            else:
                ext_preference = _rank(video_order, ext)
                audio_ext_preference = 0

            # None cannot be ordered against str on Python 3; an empty
            # string still sorts before every real format_id.
            format_id = f.get('format_id')
            if format_id is None:
                format_id = ''

            return (
                preference,
                _or_minus_one(f.get('height')),
                _or_minus_one(f.get('width')),
                ext_preference,
                _or_minus_one(f.get('vbr')),
                _or_minus_one(f.get('abr')),
                audio_ext_preference,
                _or_minus_one(f.get('filesize')),
                format_id,
            )
        formats.sort(key=_formats_key)
495
496
class SearchInfoExtractor(InfoExtractor):
    """
    Common base for extractors that serve paged search queries.
    Accepted queries have the form _SEARCH_KEY(|all|[0-9]):{query}
    Concrete classes are expected to define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Regex for this class's queries: search key, optional count prefix
        # (empty, a positive number or "all"), a colon, then the query text.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        # True when url matches the query regex from its beginning.
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the query and delegate to _get_n_results.

        An empty prefix requests 1 result, "all" requests _MAX_RESULTS, and
        a number requests that many, capped at _MAX_RESULTS with a warning.
        Raises ExtractorError when the query does not parse.
        """
        parsed = re.match(self._make_valid_url(), query)
        if parsed is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = parsed.group('prefix')
        query = parsed.group('query')

        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        # Read-only public view of _SEARCH_KEY.
        return self._SEARCH_KEY