# youtube_dl/utils.py
# Work around buggy HTML Parser in Python < 2.7.3 (Closes #662)
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import gzip
5 import io
6 import json
7 import locale
8 import os
9 import re
10 import sys
11 import traceback
12 import zlib
13 import email.utils
14 import json
15
16 try:
17 import urllib.request as compat_urllib_request
18 except ImportError: # Python 2
19 import urllib2 as compat_urllib_request
20
21 try:
22 import urllib.error as compat_urllib_error
23 except ImportError: # Python 2
24 import urllib2 as compat_urllib_error
25
26 try:
27 import urllib.parse as compat_urllib_parse
28 except ImportError: # Python 2
29 import urllib as compat_urllib_parse
30
31 try:
32 from urllib.parse import urlparse as compat_urllib_parse_urlparse
33 except ImportError: # Python 2
34 from urlparse import urlparse as compat_urllib_parse_urlparse
35
36 try:
37 import http.cookiejar as compat_cookiejar
38 except ImportError: # Python 2
39 import cookielib as compat_cookiejar
40
41 try:
42 import html.entities as compat_html_entities
43 except ImportError: # Python 2
44 import htmlentitydefs as compat_html_entities
45
46 try:
47 import html.parser as compat_html_parser
48 except ImportError: # Python 2
49 import HTMLParser as compat_html_parser
50
51 try:
52 import http.client as compat_http_client
53 except ImportError: # Python 2
54 import httplib as compat_http_client
55
# subprocess.DEVNULL is only available on newer Pythons; on older ones fall
# back to opening os.devnull.  Exposed as a callable so the fallback file is
# only opened when actually requested.
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
61
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Percent-decode *string* (Python 2 backport of urllib.parse.unquote).

        Contiguous percent-escapes are collected into one byte string before
        decoding so multi-byte (e.g. UTF-8) sequences decode correctly.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' present: nothing to decode.
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Python 2 str.decode('hex'): '41' -> 'A'; raises on non-hex input
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
                # Encountered non-percent-encoded characters. Flush the current
                # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        Mirrors urllib.parse.parse_qsl; results are coerced to unicode
        (this branch only ever runs on Python 2, where `unicode` exists).
        """
        qs, _coerce_result = qs, unicode
        # Pairs may be separated by either '&' or ';'.
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values.

        Drop-in replacement for urllib.parse.parse_qs on Python 2.
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
140
# Unified text type across Python 2 (unicode) and Python 3 (str).
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Unified codepoint-to-character function (unichr on Python 2, chr on 3).
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
150
# Default HTTP headers forced onto every request by YoutubeDLHandler.http_request
# (any caller-supplied header with the same name is replaced there).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
158
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding actually works; some platforms
        # report bogus values or raise here.
        u'TEST'.encode(pref)
    except Exception:
        # Was a bare "except:", which also swallowed SystemExit and
        # KeyboardInterrupt; only real errors (locale.Error, LookupError,
        # UnicodeEncodeError, ...) should trigger the UTF-8 fallback.
        pref = 'UTF-8'

    return pref
172
if sys.version_info < (3,0):
    def compat_print(s):
        """Print a unicode string, encoding it for the console on Python 2."""
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        """Print a text string (Python 3: print handles unicode natively)."""
        assert type(s) == type(u'')
        print(s)
180
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3,0):
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (binary mode)."""
        with open(fn, 'wb') as outf:
            json.dump(obj, outf)
else:
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (UTF-8 text mode)."""
        with open(fn, 'w', encoding='utf-8') as outf:
            json.dump(obj, outf)
191
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#233;) or hexadecimal (&#xe9;).
    # The previous pattern u'#(x?\\d+)' only allowed decimal digits after
    # the optional 'x', so hex references containing a-f (e.g. &#xe9;) were
    # silently returned as literal text.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr  # int() accepts the '0x...' form in base 16
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
216
217 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # Attribute name/value pair identifying the tag to extract.
        self.attribute = attribute
        self.value = value
        # Built up as [tagname, startpos, endpos] during parsing;
        # get_result() requires all three entries to be present.
        self.result = None
        # True while we are inside the wanted tag.
        self.started = False
        # Per-tag-name open/close counters, used to locate the close tag
        # matching the element we started in.
        self.depth = {}
        self.html = None
        # Set right after the wanted start tag is seen; the next parser
        # event records the content start position (see find_startpos).
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        # Error recovery for sloppy HTML: skip to the next line and resume,
        # giving up after 10 errors or once inside the target tag.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        # Parse a complete HTML document given as a string.
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element is closed once its own tag's depth returns to 0.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any parser event following the wanted start tag marks the content start.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Return the text between start and end positions, or None if the
        # tag was not found or not fully delimited.
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # getpos() positions are (1-based line number, 0-based column).
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On Python < 2.7.3 the literal "</scr'+'ipt>" (a common inline-JS trick)
# trips up HTMLParser.parse_endtag; treat that exact token as consumed
# and skip past it instead of delegating to the stdlib implementation.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
289
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper over get_element_by_attribute for the common id="..." case.
    # (Parameter name "id" shadows the builtin; kept for interface stability.)
    return get_element_by_attribute("id", id, html)
293
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: the parser may have captured the element before the
        # document turned unparseable.
        pass
    return attr_parser.get_result()
302
303
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Literal newlines are presentation-only in HTML; real breaks come from
    # <br> and paragraph boundaries, which we turn into '\n'.
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Drop all remaining tags, then decode HTML entities.
    text = re.sub('<.*?>', '', text)
    return unescapeHTML(text)
315
316
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # '-' means standard output; on Windows, put stdout into binary
            # mode so written data is not newline-mangled.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
342
343
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when *timestr* cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
351
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Always dropped: '?', control characters, DEL.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(replace_insane(c) for c in s)
    if not is_id:
        # Collapse underscore runs produced by the substitutions above.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
383
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # First-occurrence order is preserved; membership is tested against the
    # output list itself, so elements need not be hashable.
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
391
def unescapeHTML(s):
    """Replace every &entity; occurrence in *s* via htmlentity_transform.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
400
def encodeFilename(s):
    """
    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    # Python 2 elsewhere: encode with the filesystem encoding, dropping
    # characters it cannot represent.
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
422
423
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None):
        """ tb, if given, is the original traceback (so that it can be printed out). """
        super(ExtractorError, self).__init__(msg)
        self.traceback = tb

    def format_traceback(self):
        """Return the stored traceback rendered as text, or None if none was given."""
        if self.traceback is None:
            return None
        frames = traceback.format_tb(self.traceback)
        return u''.join(frames)
435
436
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    # Marker exception only; the message is carried in Exception.args.
    pass
445
446
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    # Marker exception only; no extra state beyond Exception.args.
    pass
454
455
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Forward msg to Exception so str(e) and e.args carry the message;
        # previously the base __init__ was never called, leaving str(e) empty.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
464
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    # Control-flow exception; presumably raised to abort further downloads
    # once the user-specified limit is hit — confirm against callers.
    pass
468
469
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    # Marker exception only; no extra state beyond Exception.args.
    pass
477
478
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give the base class a descriptive message so str(e) is useful;
        # previously Exception.__init__ was never called and str(e) was empty.
        super(ContentTooShortError, self).__init__(
            u'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
493
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Servers may send either a raw DEFLATE stream or one wrapped in a
        # zlib header; try the raw form first and fall back to the wrapped one.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Newer addinfourl takes the status code in the constructor (detected
        # via the presence of getcode); emulate that on older Pythons by
        # assigning .code after construction.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force the standard headers, replacing any the caller already set.
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Internal marker header: strip Accept-encoding so the server replies
        # uncompressed, then remove the marker itself before sending.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Internal marker header: per-request User-agent override.
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        # Transparently decompress the response body, preserving the original
        # headers, URL, status code and msg on the replacement response.
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # urllib dispatches https traffic to the https_* hooks; same behavior.
    https_request = http_request
    https_response = http_response