]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
Fix --match-title and --reject-title decoding (Closes #690)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import gzip
03f9daab 5import io
f4bfd65f 6import json
d77c3dfd
FV
7import locale
8import os
9import re
10import sys
01951dda 11import traceback
d77c3dfd 12import zlib
d77c3dfd 13import email.utils
921a1455 14import json
d77c3dfd 15
# Python 2/3 compatibility aliases: the urllib/cookiejar/html/http modules
# were renamed in Python 3, so each is imported under a single compat_* name
# that the rest of the code can use unconditionally.
try:
    import urllib.request as compat_urllib_request
except ImportError: # Python 2
    import urllib2 as compat_urllib_request

try:
    import urllib.error as compat_urllib_error
except ImportError: # Python 2
    import urllib2 as compat_urllib_error

try:
    import urllib.parse as compat_urllib_parse
except ImportError: # Python 2
    import urllib as compat_urllib_parse

try:
    from urllib.parse import urlparse as compat_urllib_parse_urlparse
except ImportError: # Python 2
    from urlparse import urlparse as compat_urllib_parse_urlparse

try:
    import http.cookiejar as compat_cookiejar
except ImportError: # Python 2
    import cookielib as compat_cookiejar

try:
    import html.entities as compat_html_entities
except ImportError: # Python 2
    import htmlentitydefs as compat_html_entities

try:
    import html.parser as compat_html_parser
except ImportError: # Python 2
    import HTMLParser as compat_html_parser

try:
    import http.client as compat_http_client
except ImportError: # Python 2
    import httplib as compat_http_client

# subprocess.DEVNULL only exists on Python 3.3+; on older versions fall back
# to opening os.devnull by hand (callers are responsible for the handle).
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
61
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _unquote(string, encoding='utf-8', errors='replace'):
        # Percent-decode *string*. Contiguous %XX runs are accumulated as a
        # byte string before decoding so multi-byte (e.g. UTF-8) sequences
        # split across several %XX escapes decode correctly.
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to unquote.
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    # '%%' in the input: treat as a literal '%' below.
                    raise ValueError
                # Python-2-only: str.decode('hex') turns '2F' into '\x2f'.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid %XX escape: keep the '%' literally.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        # Split a query string into an ordered list of (name, value) pairs.
        # Results are coerced to unicode (this branch only runs on Python 2).
        qs, _coerce_result = qs, unicode
        # Both '&' and ';' are accepted as pair separators.
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' means space in application/x-www-form-urlencoded data.
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        # Like urllib.parse.parse_qs: values of repeated names are grouped
        # into lists, preserving order of first appearance.
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 140
# Unicode text type and code-point constructor under one name for both
# Python 2 (unicode/unichr) and Python 3 (str/chr).
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
3e669f36 150
# Default HTTP headers applied to every request by YoutubeDLHandler;
# identifies as a regular desktop Firefox browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 158
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec actually exists and can encode text;
        # some platforms report bogus or unusable values.
        u'TEST'.encode(pref)
    except Exception:
        # Narrowed from a bare `except:`: still falls back to UTF-8 on any
        # locale lookup/encode failure, but no longer swallows
        # KeyboardInterrupt or SystemExit.
        pref = 'UTF-8'

    return pref
d77c3dfd 172
# On Python 3 print() handles unicode natively; on Python 2 the text must be
# encoded for the console by hand.
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print a unicode string to stdout."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print a unicode string, encoded for the current console."""
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 180
f4bfd65f
PH
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (text mode, UTF-8)."""
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (byte stream)."""
        with open(fn, 'wb') as f:
            json.dump(obj, f)
191
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#47;) or hexadecimal (&#x2F;).
    # The hex alternative must allow digits a-f: the previous pattern
    # (x?\d+) only matched decimal digits, so e.g. '#x2F' matched as 'x2'
    # and was mis-decoded to chr(2) instead of '/'.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 216
# Backport of a newer stdlib HTMLParser regex for locating the end of a start
# tag (handles tricky attribute values); patches the module-level pattern
# that the parser uses internally.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
43e8fafd
ND
class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # Attribute name/value pair identifying the wanted tag.
        self.attribute = attribute
        self.value = value
        self.result = None           # becomes [tag, startpos, endpos] once found
        self.started = False         # True while inside the wanted tag
        self.depth = {}              # open-tag nesting counters, keyed by tag name
        self.html = None             # full document text, kept for slicing in get_result()
        self.watch_startpos = False  # next parser event marks the content start position
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        # Tolerate broken HTML: skip past the offending line and resume,
        # but give up after 10 errors or once the wanted tag was entered.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        # Feed the whole document through the parser.
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # Any event after the opening tag fixes the content start position.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # Wanted tag closed at its own nesting level: record the end position.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event also fixes the start position if still pending.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Returns the text between the recorded start/end positions,
        # or None when the tag was never found (or never closed).
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # getpos() line numbers are 1-based; slice out the spanned lines.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
3b024e17
PH
# Hack for https://github.com/rg3/youtube-dl/issues/662
# Older HTMLParser versions choke on the literal "</scr'+'ipt>" that some
# sites embed in script blocks; treat it as one fixed-length token instead.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
289
def get_element_by_id(id, html):
    """Return the content of the tag carrying the given ID in the passed
    HTML document (an id is just a particular attribute)."""
    return get_element_by_attribute("id", id, html)
293
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag whose *attribute* equals *value*
    in the passed HTML document (None when no such tag is found)."""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Markup too broken to recover; report whatever was collected so far.
        pass
    return finder.get_result()
9e6dd238
FV
302
303
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Literal newlines are mere whitespace in HTML ...
    html = html.replace('\n', ' ')
    # ... while <br> and paragraph boundaries become real newlines.
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Drop all remaining tags.
    html = re.sub('<.*?>', '', html)
    # Resolve HTML entities.
    return unescapeHTML(html)
9e6dd238
FV
315
316
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        # u'-' means "write to stdout".
        if filename == u'-':
            if sys.platform == 'win32':
                # Switch stdout to binary mode so byte output is not mangled.
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
d77c3dfd
FV
342
343
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp
    (None when the string cannot be parsed)."""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 351
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def _sanitize_char(char):
        # Order matters: the specific replacements must run before the
        # catch-all restricted-mode rules.
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    result = u''.join(_sanitize_char(char) for char in s)
    if not is_id:
        # Collapse underscore runs left over from the substitutions.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
d77c3dfd
FV
383
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Membership is tested against a list (not a set) on purpose, so that
    # unhashable elements keep working; inputs are expected to be small.
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd
FV
391
def unescapeHTML(s):
    """Resolve HTML entities in the unicode string *s*.

    @param s a string
    """
    assert type(s) == type(u'')
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd
FV
400
def encodeFilename(s):
    """Encode a unicode filename for the local filesystem.

    @param s The name of the file
    """
    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
d77c3dfd 422
8271226a
PH
def decodeOption(optval):
    """Decode a command-line option value to unicode; None passes through."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, compat_str)
    return optval
1c256f70
PH
431
class ExtractorError(Exception):
    """Error during info extraction.

    The optional ``tb`` argument is the original traceback, kept so that
    it can be printed out later.
    """
    def __init__(self, msg, tb=None):
        super(ExtractorError, self).__init__(msg)
        self.traceback = tb

    def format_traceback(self):
        """Render the stored traceback as text; None when absent."""
        tb = self.traceback
        if tb is None:
            return None
        return u''.join(traceback.format_tb(tb))
443
1c256f70 444
class DownloadError(Exception):
    """Download Error exception.

    Thrown by FileDownloader objects that are not configured to continue
    on errors; carries the appropriate error message.
    """
    pass
d77c3dfd
FV
453
454
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
    pass
d77c3dfd
FV
462
463
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Also hand the message to the Exception base class so str(exc)
        # and generic logging show it instead of an empty string.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
472
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
    pass
d77c3dfd
FV
476
477
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
    pass
d77c3dfd
FV
485
486
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a file they download is smaller
    than what the server announced first, indicating the connection was
    probably interrupted.
    """
    # Both sizes are in bytes.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 501
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Some servers send raw deflate data without the zlib header;
        # try headerless first, then fall back to the standard format.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # addinfourl only accepts a 'code' constructor argument on versions
        # that have getcode(); otherwise set the attribute by hand.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Install the standard headers, replacing any same-named headers
        # already present on the request.
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Internal marker header: strip Accept-encoding so the server
        # responds uncompressed; the marker itself is removed before sending.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Internal marker header: per-request User-agent override.
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        # Transparently decompress the response body, preserving the
        # original headers, URL, status code and msg.
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # Apply the same processing to HTTPS traffic.
    https_request = http_request
    https_response = http_response