]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
[mtv] Fixup incorrectly encoded XML documents
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
d6983cb4
PH
1import base64
2import os
3import re
4import socket
5import sys
fc79158d 6import netrc
267ed0c5 7import xml.etree.ElementTree
d6983cb4
PH
8
9from ..utils import (
10 compat_http_client,
11 compat_urllib_error,
d6983cb4
PH
12 compat_str,
13
14 clean_html,
15 compiled_regex_type,
16 ExtractorError,
55b3e45b 17 RegexNotFoundError,
d41e6efc 18 sanitize_filename,
f38de77f 19 unescapeHTML,
d6983cb4
PH
20)
21
dca08720 22
d6983cb4
PH
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url         Mandatory. The URL of the video file
                    * ext         Will be calculated from url if missing
                    * format      A human-readable description of the format
                                  ("mp4 container with h264/opus").
                                  Calculated from the format_id, width, height
                                  and format_note fields if missing.
                    * format_id   A short description of the format
                                  ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                  ("3D" or "DASH video")
                    * width       Width of the video, if known
                    * height      Height of the video, if known
                    * abr         Average audio bitrate in KBit/s
                    * acodec      Name of the audio codec in use
                    * vbr         Average video bitrate in KBit/s
                    * vcodec      Name of the video codec in use
                    * filesize    The number of bytes, if known in advance
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Set to True by initialize() once _real_initialize() has run.
    _ready = False
    # Downloader object used for network access and console output.
    _downloader = None
    # Subclasses set this to False to mark themselves as broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc).

        _real_initialize() is run at most once per instance; later calls
        are no-ops.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor

        This is the class name with its last two characters removed
        (subclasses are presumably named "<Something>IE").
        """
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        """Display name: the class name without its last two characters."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """Returns the response handle.

        note:    progress message; None prints the default "Downloading
                 webpage" message, False prints nothing.
        errnote: prefix of the error message (defaults to
                 "Unable to download webpage").
        fatal:   if True a download error raises ExtractorError, otherwise
                 a warning is reported and False is returned.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """Returns a tuple (page content as string, URL handle).

        Returns False instead when the download failed and fatal is False.
        The text encoding is taken from the Content-Type header, else from
        a <meta ... charset=...> tag in the first 1024 bytes, else UTF-8.
        """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'

        params = self._downloader.params
        dump_pages = params.get('dump_intermediate_pages', False)
        write_pages = params.get('write_pages', False)
        if dump_pages or write_pages:
            # Request objects expose get_full_url(); plain strings do not.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
        if dump_pages:
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if write_pages:
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The declared charset is not a codec Python knows about
            # (e.g. a misspelt name or trailing header parameters);
            # fall back to UTF-8 rather than aborting the extraction.
            content = webpage_bytes.decode('utf-8', 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """Returns the data of the page as a string (or False, see fatal)."""
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None):
        """Return the xml as an xml.etree.ElementTree.Element

        transform_source, if given, is called with the downloaded text and
        its return value is parsed instead (useful to repair broken XML).
        """
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    def url_result(self, url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        # Start from "no match" so that an empty list of patterns is handled
        # like any other failed search instead of leaving mobj unbound.
        mobj = None
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        if default is not None:
            return default

        # Highlight the field name in blue on ANSI-capable terminals.
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        self._downloader.report_warning(u'unable to extract %s; '
            u'please report this issue on http://yt-dl.org/bug' % _name)
        return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            # A username may be supplied without a password entry.
            password = downloader_params.get('password')
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        """Return two regexes matching the <meta> tag of OpenGraph property prop.

        One expects the property attribute before content, the other the
        reverse order. The content value is captured in group 1
        (double-quoted) or group 2 (single-quoted).
        """
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        """Return the unescaped content of OpenGraph property prop, if found.

        Extra keyword arguments are forwarded to _search_regex.
        """
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        """Non-fatal lookup of the og:image property."""
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        """Non-fatal lookup of the og:description property."""
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        """Lookup of the og:title property (fatal unless overridden)."""
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        """Lookup of og:video; og:video:secure_url is tried first if secure."""
        regexes = self._og_regexes('video')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        """Non-fatal lookup of the content of the <meta> tag called name.

        The tag is matched through its itemprop, name or property attribute.
        """
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        """Return the dc.creator meta value, reported as 'uploader'."""
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        """Return 18 if the page carries the RTA rating meta tag, else 0."""
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        """Map the 'rating' meta tag to an age limit, or None if unknown."""
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)
420
421
8dbe9899 422
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Common base for extractors that answer paged search queries.

    A query has the form _SEARCH_KEY<prefix>:<query>, where <prefix> is
    empty (one result), a positive number (that many results) or "all"
    (_MAX_RESULTS results).
    Subclasses are expected to provide _SEARCH_KEY and _MAX_RESULTS and to
    implement _get_n_results().
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the regexp source that recognises this extractor's queries."""
        query_pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return query_pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results."""
        parsed = re.match(self._make_valid_url(), query)
        if parsed is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = parsed.group('prefix'), parsed.group('query')

        # No count given: a single result is wanted.
        if prefix == '':
            return self._get_n_results(query, 1)
        # "all" means as many results as the extractor supports.
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        wanted = int(prefix)
        if wanted <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (wanted, query))
        if wanted > self._MAX_RESULTS:
            # Clamp oversized requests to the supported maximum and say so.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
            wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class's _SEARCH_KEY."""
        return self._SEARCH_KEY