]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
Add an extractor for cmt.com (closes #2049)
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
d6983cb4
PH
1import base64
2import os
3import re
4import socket
5import sys
fc79158d 6import netrc
267ed0c5 7import xml.etree.ElementTree
d6983cb4
PH
8
9from ..utils import (
10 compat_http_client,
11 compat_urllib_error,
c7deaa4c 12 compat_urllib_parse_urlparse,
d6983cb4
PH
13 compat_str,
14
15 clean_html,
16 compiled_regex_type,
17 ExtractorError,
55b3e45b 18 RegexNotFoundError,
d41e6efc 19 sanitize_filename,
f38de77f 20 unescapeHTML,
d6983cb4 21)
46374a56 22_NO_DEFAULT = object()
d6983cb4 23
dca08720 24
d6983cb4
PH
25class InfoExtractor(object):
26 """Information Extractor class.
27
28 Information extractors are the classes that, given a URL, extract
29 information about the video (or videos) the URL refers to. This
30 information includes the real video URL, the video title, author and
31 others. The information is stored in a dictionary which is then
32 passed to the FileDownloader. The FileDownloader processes this
33 information possibly downloading the video to the file system, among
34 other possible outcomes.
35
36 The dictionaries must include the following fields:
37
38 id: Video identifier.
d6983cb4 39 title: Video title, unescaped.
d67b0b15 40
f49d89ee 41 Additionally, it must contain either a formats entry or a url one:
d67b0b15 42
f49d89ee
PH
43 formats: A list of dictionaries for each format available, ordered
44 from worst to best quality.
45
46 Potential fields:
d67b0b15
PH
47 * url Mandatory. The URL of the video file
48 * ext Will be calculated from url if missing
49 * format A human-readable description of the format
50 ("mp4 container with h264/opus").
51 Calculated from the format_id, width, height.
52 and format_note fields if missing.
53 * format_id A short description of the format
5d4f3985
PH
54 ("mp4_h264_opus" or "19").
55 Technically optional, but strongly recommended.
d67b0b15
PH
56 * format_note Additional info about the format
57 ("3D" or "DASH video")
58 * width Width of the video, if known
59 * height Height of the video, if known
f49d89ee 60 * resolution Textual description of width and height
7217e148 61 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
62 * abr Average audio bitrate in KBit/s
63 * acodec Name of the audio codec in use
64 * vbr Average video bitrate in KBit/s
65 * vcodec Name of the video codec in use
66 * filesize The number of bytes, if known in advance
67 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
68 * protocol The protocol that will be used for the actual
69 download, lower-case.
70 "http", "https", "rtsp", "rtmp" or so.
f49d89ee
PH
71 * preference Order number of this format. If this field is
72 present, the formats get sorted by this field.
73 -1 for default (order by other properties),
74 -2 or smaller for less than default.
c0ba0f48 75 url: Final video URL.
d6983cb4 76 ext: Video filename extension.
d67b0b15
PH
77 format: The video format, defaults to ext (used for --get-format)
78 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 79
d6983cb4
PH
80 The following fields are optional:
81
73e79f2a
PH
82 thumbnails: A list of dictionaries (with the entries "resolution" and
83 "url") for the varying thumbnails
d6983cb4
PH
84 thumbnail: Full URL to a video thumbnail image.
85 description: One-line video description.
86 uploader: Full name of the video uploader.
87 upload_date: Video upload date (YYYYMMDD).
88 uploader_id: Nickname or id of the video uploader.
89 location: Physical location of the video.
5d51a883
JMF
90 subtitles: The subtitle file contents as a dictionary in the format
91 {language: subtitles}.
c0ba0f48 92 duration: Length of the video in seconds, as an integer.
f3d29461 93 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
94 like_count: Number of positive ratings of the video
95 dislike_count: Number of negative ratings of the video
96 comment_count: Number of comments on the video
8dbe9899 97 age_limit: Age restriction for the video, as an integer (years)
9103bbc5
JMF
98 webpage_url: The url to the video webpage, if given to youtube-dl it
99 should allow to get the same result again. (It will be set
100 by YoutubeDL if it's missing)
d6983cb4 101
deefc05b 102 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4
PH
103
104 Subclasses of this one should re-define the _real_initialize() and
105 _real_extract() methods and define a _VALID_URL regexp.
106 Probably, they should also be added to the list of extractors.
107
108 _real_extract() must return a *list* of information dictionaries as
109 described above.
110
111 Finally, the _WORKING attribute should be set to False for broken IEs
112 in order to warn the users and skip the tests.
113 """
114
115 _ready = False
116 _downloader = None
117 _WORKING = True
118
119 def __init__(self, downloader=None):
120 """Constructor. Receives an optional downloader."""
121 self._ready = False
122 self.set_downloader(downloader)
123
@classmethod
def suitable(cls, url):
    """Tell whether this extractor's _VALID_URL pattern matches *url*."""
    # Look in cls.__dict__ directly rather than using hasattr()/getattr():
    # those would also find a pattern cached on a superclass, whereas each
    # class has to compile (and cache) its own _VALID_URL.
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    matched = cls._VALID_URL_RE.match(url)
    return matched is not None
d6983cb4
PH
134
@classmethod
def working(cls):
    """Return the class-level _WORKING flag."""
    flag = cls._WORKING
    return flag
139
def initialize(self):
    """Run _real_initialize() (authentication, etc) once per instance."""
    if self._ready:
        return
    self._real_initialize()
    # Only mark the instance ready after _real_initialize() has returned.
    self._ready = True
145
def extract(self, url):
    """Initialize the extractor if needed and return _real_extract(url)."""
    self.initialize()
    result = self._real_extract(url)
    return result
150
def set_downloader(self, downloader):
    """Attach *downloader* to this extractor (stored as self._downloader)."""
    self._downloader = downloader
154
155 def _real_initialize(self):
156 """Real initialization process. Redefine in subclasses."""
157 pass
158
159 def _real_extract(self, url):
160 """Real extraction process. Redefine in subclasses."""
161 pass
162
56c73665
JMF
@classmethod
def ie_key(cls):
    """Return the string used to look this extractor up with get_info_extractor.

    The key is the class name with its last two characters removed.
    """
    class_name = cls.__name__
    return class_name[:-2]
167
d6983cb4
PH
@property
def IE_NAME(self):
    """Name of the instance's class with its last two characters removed."""
    class_name = type(self).__name__
    return class_name[:-2]
171
7cc3570e 172 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4
PH
173 """ Returns the response handle """
174 if note is None:
175 self.report_download_webpage(video_id)
176 elif note is not False:
7cc3570e
PH
177 if video_id is None:
178 self.to_screen(u'%s' % (note,))
179 else:
180 self.to_screen(u'%s: %s' % (video_id, note))
d6983cb4 181 try:
dca08720 182 return self._downloader.urlopen(url_or_request)
d6983cb4 183 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
aa94a6d3
PH
184 if errnote is False:
185 return False
d6983cb4
PH
186 if errnote is None:
187 errnote = u'Unable to download webpage'
7cc3570e
PH
188 errmsg = u'%s: %s' % (errnote, compat_str(err))
189 if fatal:
190 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
191 else:
192 self._downloader.report_warning(errmsg)
193 return False
d6983cb4 194
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """Download a page and return a tuple (page content as string, URL handle).

    note, errnote and fatal are passed on to _request_webpage(). If that
    returns False (failed, non-fatal request), False is returned instead
    of a tuple.

    The text encoding is taken from the charset parameter of the
    Content-Type header, else from a <meta ... charset=...> tag within the
    first 1024 bytes of the body, else UTF-8 is assumed. If the resulting
    name is not an encoding Python knows, the body is decoded as UTF-8.
    Undecodable bytes are replaced, never raised.

    Downloader params honoured here:
      dump_intermediate_pages: print the raw body, base64-encoded.
      write_pages: save the raw body to a "<video_id>_<url>.dump" file
          (name passed through sanitize_filename).
    """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if urlh is False:
        # NOTE(review): _request_webpage() also returns False when errnote
        # is False, whatever `fatal` is, so this assert can trip for
        # errnote=False combined with fatal=True.
        assert not fatal
        return False

    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if m:
        encoding = m.group(1)
    else:
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            encoding = m.group(1).decode('ascii')
        else:
            encoding = 'utf-8'

    params = self._downloader.params
    dump_pages = params.get('dump_intermediate_pages', False)
    write_pages = params.get('write_pages', False)
    if dump_pages or write_pages:
        # Request objects expose get_full_url(); plain strings are the URL.
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
    if dump_pages:
        self.to_screen(u'Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if write_pages:
        raw_filename = ('%s_%s.dump' % (video_id, url))
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen(u'Saving request to ' + filename)
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    try:
        content = webpage_bytes.decode(encoding, 'replace')
    except LookupError:
        # The advertised charset is not a codec Python knows (the header
        # regex above also captures quotes and trailing parameters), so
        # fall back to UTF-8 instead of aborting the extraction.
        content = webpage_bytes.decode('utf-8', 'replace')
    return (content, urlh)
239
7cc3570e 240 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4 241 """ Returns the data of the page as a string """
7cc3570e
PH
242 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
243 if res is False:
244 return res
245 else:
246 content, _ = res
247 return content
d6983cb4 248
2a275ab0 249 def _download_xml(self, url_or_request, video_id,
e2b38da9
PH
250 note=u'Downloading XML', errnote=u'Unable to download XML',
251 transform_source=None):
267ed0c5
JMF
252 """Return the xml as an xml.etree.ElementTree.Element"""
253 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
e2b38da9
PH
254 if transform_source:
255 xml_string = transform_source(xml_string)
267ed0c5
JMF
256 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
257
f45f96f8
PH
def report_warning(self, msg, video_id=None):
    """Pass a warning to the downloader, tagged with IE_NAME and, if given, video_id."""
    if video_id is None:
        idstr = u''
    else:
        idstr = u'%s: ' % video_id
    text = u'[%s] %s%s' % (self.IE_NAME, idstr, msg)
    self._downloader.report_warning(text)
262
d6983cb4
PH
def to_screen(self, msg):
    """Send msg to the downloader's screen output, prefixed with '[ie_name]'."""
    line = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(line)
266
def report_extraction(self, id_or_name):
    """Print the "Extracting information" progress line for id_or_name."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
270
def report_download_webpage(self, video_id):
    """Print the "Downloading webpage" progress line for video_id."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
274
def report_age_confirmation(self):
    """Print the progress line for an age confirmation attempt."""
    message = u'Confirming age'
    self.to_screen(message)
278
fc79158d
JMF
def report_login(self):
    """Print the progress line for a login attempt."""
    message = u'Logging in'
    self.to_screen(message)
282
d6983cb4 283 #Methods for following #608
c0d0b01f
JMF
@staticmethod
def url_result(url, ie=None, video_id=None):
    """Build a result of type 'url': a pointer to a page that still has to be processed.

    The 'id' key is only included when video_id is not None.
    """
    #TODO: ie should be the class used for getting the info
    result = {
        '_type': 'url',
        'url': url,
        'ie_key': ie,
    }
    if video_id is not None:
        result['id'] = video_id
    return result
c0d0b01f
JMF
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None):
    """Build a result of type 'playlist' wrapping entries.

    'id' and 'title' are only included when the corresponding argument is
    truthy.
    """
    result = {'_type': 'playlist', 'entries': entries}
    optional = (('id', playlist_id), ('title', playlist_title))
    for key, value in optional:
        if value:
            result[key] = value
    return result
304
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    pattern: a pattern (string or compiled regex) or an iterable of
        patterns, tried in order until one matches.
    default: if given, returned when nothing matches (takes precedence
        over fatal).
    fatal: without a default, raise RegexNotFoundError when true,
        otherwise report a warning through the downloader and return None.
    flags: passed to re.search for every pattern.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        patterns = [pattern]
    else:
        patterns = pattern

    # Start from None so that an empty list of patterns is treated as
    # "nothing matched" instead of leaving mobj unbound.
    mobj = None
    for p in patterns:
        mobj = re.search(p, string, flags)
        if mobj:
            break

    if mobj:
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
    if default is not _NO_DEFAULT:
        return default

    # The (possibly colored) field name is only needed for the messages
    # below, so stderr is not touched on the success/default paths.
    if os.name != 'nt' and sys.stderr.isatty():
        _name = u'\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if fatal:
        raise RegexNotFoundError(u'Unable to extract %s' % _name)
    self._downloader.report_warning(u'unable to extract %s; '
        u'please report this issue on http://yt-dl.org/bug' % _name)
    return None
335
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Same search as _search_regex, but a truthy result is passed through
    clean_html() and stripped of surrounding whitespace. Falsy results
    (None, empty string, ...) are returned unchanged.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags)
    if not res:
        return res
    return clean_html(res).strip()
345
fc79158d
JMF
346 def _get_login_info(self):
347 """
348 Get the the login info as (username, password)
349 It will look in the netrc file using the _NETRC_MACHINE value
350 If there's no info available, return (None, None)
351 """
352 if self._downloader is None:
353 return (None, None)
354
355 username = None
356 password = None
357 downloader_params = self._downloader.params
358
359 # Attempt to use provided username and password or .netrc data
360 if downloader_params.get('username', None) is not None:
361 username = downloader_params['username']
362 password = downloader_params['password']
363 elif downloader_params.get('usenetrc', False):
364 try:
365 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
366 if info is not None:
367 username = info[0]
368 password = info[2]
369 else:
370 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
371 except (IOError, netrc.NetrcParseError) as err:
372 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
373
374 return (username, password)
375
46720279
JMF
376 # Helper functions for extracting OpenGraph info
377 @staticmethod
ab2d5247 378 def _og_regexes(prop):
78fb87b2
JMF
379 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
380 property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
381 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 382 return [
78fb87b2
JMF
383 template % (property_re, content_re),
384 template % (content_re, property_re),
ab2d5247 385 ]
46720279 386
3c4e6d83 387 def _og_search_property(self, prop, html, name=None, **kargs):
46720279 388 if name is None:
3c4e6d83 389 name = 'OpenGraph %s' % prop
ab2d5247 390 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
391 if escaped is None:
392 return None
393 return unescapeHTML(escaped)
46720279
JMF
394
395 def _og_search_thumbnail(self, html, **kargs):
3c4e6d83 396 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
46720279
JMF
397
398 def _og_search_description(self, html, **kargs):
399 return self._og_search_property('description', html, fatal=False, **kargs)
400
401 def _og_search_title(self, html, **kargs):
402 return self._og_search_property('title', html, **kargs)
403
8ffa13e0 404 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
ab2d5247
JMF
405 regexes = self._og_regexes('video')
406 if secure: regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 407 return self._html_search_regex(regexes, html, name, **kargs)
46720279 408
59040888
PH
409 def _html_search_meta(self, name, html, display_name=None):
410 if display_name is None:
411 display_name = name
412 return self._html_search_regex(
aaebed13
PH
413 r'''(?ix)<meta
414 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
59040888
PH
415 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
416 html, display_name, fatal=False)
417
418 def _dc_search_uploader(self, html):
419 return self._html_search_meta('dc.creator', html, 'uploader')
420
8dbe9899
PH
421 def _rta_search(self, html):
422 # See http://www.rtalabel.org/index.php?content=howtofaq#single
423 if re.search(r'(?ix)<meta\s+name="rating"\s+'
424 r' content="RTA-5042-1996-1400-1577-RTA"',
425 html):
426 return 18
427 return 0
428
59040888
PH
429 def _media_rating_search(self, html):
430 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
431 rating = self._html_search_meta('rating', html)
432
433 if not rating:
434 return None
435
436 RATING_TABLE = {
437 'safe for kids': 0,
438 'general': 8,
439 '14 years': 14,
440 'mature': 17,
441 'restricted': 19,
442 }
443 return RATING_TABLE.get(rating.lower(), None)
444
4bcc7bd1
PH
def _sort_formats(self, formats):
    """Sort the list of format dictionaries in place, lowest-ranked first.

    Side effect: a format without 'ext' but with 'url' gets its 'ext'
    filled in from the URL.

    Formats are compared on, in this order: preference, height, width,
    container preference (video formats), vbr, abr, container preference
    (audio-only formats), filesize and format_id. Missing numeric fields
    count as -1 and a missing format_id as '' so that every pair of
    formats stays comparable.
    """
    def _formats_key(f):
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        def _or_minus_one(field):
            value = f.get(field)
            return value if value is not None else -1

        preference = f.get('preference')
        if preference is None:
            proto = f.get('protocol')
            if proto is None:
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

            preference = 0 if proto in ['http', 'https'] else -0.1
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                preference -= 0.5

        # A later position in ORDER means a higher rank; extensions that
        # are not listed (or missing altogether) rank below all of them.
        prefer_free = self._downloader.params.get('prefer_free_formats')
        if f.get('vcodec') == 'none':  # audio only
            if prefer_free:
                ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
            else:
                ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
            ext_preference = 0
            try:
                # .get(): a format with neither 'ext' nor 'url' must not
                # abort the sort with a KeyError
                audio_ext_preference = ORDER.index(f.get('ext'))
            except ValueError:
                audio_ext_preference = -1
        else:
            if prefer_free:
                ORDER = [u'flv', u'mp4', u'webm']
            else:
                ORDER = [u'webm', u'flv', u'mp4']
            try:
                ext_preference = ORDER.index(f.get('ext'))
            except ValueError:
                ext_preference = -1
            audio_ext_preference = 0

        format_id = f.get('format_id')
        return (
            preference,
            _or_minus_one('height'),
            _or_minus_one('width'),
            ext_preference,
            _or_minus_one('vbr'),
            _or_minus_one('abr'),
            audio_ext_preference,
            _or_minus_one('filesize'),
            # None and str do not compare on Python 3; use '' as the
            # lowest possible id instead
            format_id if format_id is not None else '',
        )
    formats.sort(key=_formats_key)
59040888 495
8dbe9899 496
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.

    Queries have the form _SEARCH_KEY<prefix>:{query}, where <prefix> is
    empty (one result), a positive number (that many results) or "all"
    (_MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results().
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex (as a string) that search queries must match."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and return the requested number of results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp the request to what the service can deliver.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY."""
        return self._SEARCH_KEY