2 # -*- coding: utf-8 -*-
18 import urllib
.request
as compat_urllib_request
19 except ImportError: # Python 2
20 import urllib2
as compat_urllib_request
23 import urllib
.error
as compat_urllib_error
24 except ImportError: # Python 2
25 import urllib2
as compat_urllib_error
28 import urllib
.parse
as compat_urllib_parse
29 except ImportError: # Python 2
30 import urllib
as compat_urllib_parse
33 from urllib
.parse
import urlparse
as compat_urllib_parse_urlparse
34 except ImportError: # Python 2
35 from urlparse
import urlparse
as compat_urllib_parse_urlparse
38 import http
.cookiejar
as compat_cookiejar
39 except ImportError: # Python 2
40 import cookielib
as compat_cookiejar
43 import html
.entities
as compat_html_entities
44 except ImportError: # Python 2
45 import htmlentitydefs
as compat_html_entities
48 import html
.parser
as compat_html_parser
49 except ImportError: # Python 2
50 import HTMLParser
as compat_html_parser
53 import http
.client
as compat_http_client
54 except ImportError: # Python 2
55 import httplib
as compat_http_client
try:
    from subprocess import DEVNULL
    # Python 3.3+: subprocess exposes DEVNULL directly.
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    # Older Pythons: fall back to opening os.devnull for writing.
    # NOTE(review): callers are responsible for closing the returned file object.
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Percent-decode *string*, treating '%'-sequences as bytes in *encoding*."""
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs."""
        qs, _coerce_result = qs, unicode
        # Both '&' and ';' are accepted as pair separators.
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping names to lists of values."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
# Default HTTP headers sent with every request (see YoutubeDLHandler.http_request).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding actually works; otherwise fall back.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
if sys.version_info < (3, 0):
    def compat_print(s):
        """Print a unicode string, encoding it for the current terminal (Python 2)."""
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        """Print a text string (Python 3: stdout is already a character stream)."""
        assert type(s) == type(u'')
        print(s)
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3, 0):
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into file *fn* (binary stream on Python 2)."""
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into file *fn* (UTF-8 text on Python 3)."""
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # NOTE(review): \d only matches digits, so hex references containing
    # letters (e.g. &#x1F;) fall through to the literal branch — preserved
    # from the original; confirm before tightening.
    mobj = re.match(u'(?u)#(x?\\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
# Backport of the cpython HTMLParser start-tag regex bugfix onto whatever
# HTMLParser module the compat layer selected.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        self.result = None        # [tag, startpos, endpos] once the tag is found
        self.started = False      # True while inside the target tag
        self.depth = {}           # per-tag-name open-tag counter
        self.html = None          # full document, kept for get_result slicing
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        # Tolerate a few parse errors in the surrounding document by skipping
        # one line and resuming, but give up once the target tag has started.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Feed the whole document to the parser."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any event following the opening tag fixes the content start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the text between the matched tag's open and close tags,
        or None if no complete match was recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Single-line content: end offset is relative to the full line,
            # so shift it by the already-removed start offset.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# Old HTMLParser chokes on the "</scr'+'ipt>" obfuscation trick; treat it as
# a fixed-length end tag instead of parsing it.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute("id", id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    parser = AttrParser(attribute, value)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best-effort: a partial parse may still have captured the element.
        pass
    return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                # Switch stdout to binary mode so video data is not mangled.
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None if *timestr* cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Control characters and '?' are never allowed.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    if not is_id:
        # Collapse runs of underscores introduced by the substitutions.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def unescapeHTML(s):
    """Replace HTML entities in *s* with their character equivalents.

    @param s a unicode string
    """
    assert type(s) == type(u'')

    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result
def encodeFilename(s):
    """Encode a unicode filename for the OS, if needed.

    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    else:
        encoding = sys.getfilesystemencoding()
        if encoding is None:
            encoding = 'utf-8'
        return s.encode(encoding, 'ignore')
def decodeOption(optval):
    """Decode a command-line option value to text, passing None through."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS, or plain seconds."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None):
        """ tb, if given, is the original traceback (so that it can be printed out). """
        super(ExtractorError, self).__init__(msg)
        self.traceback = tb
        self.exc_info = sys.exc_info() # preserve original exception

    def format_traceback(self):
        """Render the stored traceback as text, or None if none was given."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        self.msg = msg
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress *data*, accepting both raw-deflate and zlib-wrapped streams."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older addinfourl lacks the code/getcode support, so set it manually.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force the standard headers, replacing any the caller set.
        for h, v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_unredirected_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except Exception:
            pass
    return upload_date
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation: months and years become fixed day counts.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            # Open-ended on the left: earliest representable date.
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            # Open-ended on the right: latest representable date.
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())