16 from .common
import InfoExtractor
, SearchInfoExtractor
17 from .subtitles
import SubtitlesInfoExtractor
24 compat_urllib_request
,
31 get_element_by_attribute
,
39 class YoutubeBaseInfoExtractor(InfoExtractor
):
40 """Provide base functions for Youtube extractors"""
41 _LOGIN_URL
= 'https://accounts.google.com/ServiceLogin'
42 _LANG_URL
= r
'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
43 _AGE_URL
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
44 _NETRC_MACHINE
= 'youtube'
45 # If True it will raise an error if no login info is provided
46 _LOGIN_REQUIRED
= False
48 def report_lang(self
):
49 """Report attempt to set language."""
50 self
.to_screen(u
'Setting language')
52 def _set_language(self
):
53 request
= compat_urllib_request
.Request(self
._LANG
_URL
)
56 compat_urllib_request
.urlopen(request
).read()
57 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
58 self
._downloader
.report_warning(u
'unable to set language: %s' % compat_str(err
))
63 (username
, password
) = self
._get
_login
_info
()
64 # No authentication to be performed
66 if self
._LOGIN
_REQUIRED
:
67 raise ExtractorError(u
'No login info available, needed for using %s.' % self
.IE_NAME
, expected
=True)
70 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
)
72 login_page
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
73 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
74 self
._downloader
.report_warning(u
'unable to fetch login page: %s' % compat_str(err
))
77 galx
= self
._search
_regex
(r
'(?s)<input.+?name="GALX".+?value="(.+?)"',
78 login_page
, u
'Login GALX parameter')
82 u
'continue': u
'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
86 u
'PersistentCookie': u
'yes',
88 u
'bgresponse': u
'js_disabled',
89 u
'checkConnection': u
'',
90 u
'checkedDomains': u
'youtube',
95 u
'signIn': u
'Sign in',
97 u
'service': u
'youtube',
101 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
103 login_form
= dict((k
.encode('utf-8'), v
.encode('utf-8')) for k
,v
in login_form_strs
.items())
104 login_data
= compat_urllib_parse
.urlencode(login_form
).encode('ascii')
105 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
, login_data
)
108 login_results
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
109 if re
.search(r
'(?i)<form[^>]* id="gaia_loginform"', login_results
) is not None:
110 self
._downloader
.report_warning(u
'unable to log in: bad username or password')
112 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
113 self
._downloader
.report_warning(u
'unable to log in: %s' % compat_str(err
))
117 def _confirm_age(self
):
120 'action_confirm': 'Confirm',
122 request
= compat_urllib_request
.Request(self
._AGE
_URL
, compat_urllib_parse
.urlencode(age_form
))
124 self
.report_age_confirmation()
125 compat_urllib_request
.urlopen(request
).read().decode('utf-8')
126 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
127 raise ExtractorError(u
'Unable to confirm age: %s' % compat_str(err
))
130 def _real_initialize(self
):
131 if self
._downloader
is None:
133 if not self
._set
_language
():
135 if not self
._login
():
140 class YoutubeIE(YoutubeBaseInfoExtractor
, SubtitlesInfoExtractor
):
141 IE_DESC
= u
'YouTube.com'
142 _VALID_URL
= r
"""(?x)^
144 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
145 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
146 tube\.majestyc\.net/|
147 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
148 (?:.*?\#/)? # handle anchor (#/) redirect urls
149 (?: # the various things that can precede the ID:
150 (?:(?:v|embed|e)/) # v/ or embed/ or e/
151 |(?: # or the v= param in all its forms
152 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
153 (?:\?|\#!?) # the params delimiter ? or # or #!
154 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
158 |youtu\.be/ # just youtu.be/xxxx
160 )? # all until now is optional -> you can pass the naked ID
161 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
162 (?(1).+)? # if we found the ID, everything can follow
164 _NEXT_URL_RE
= r
'[\?&]next_url=([^&]+)'
165 # Listed in order of quality
166 _available_formats
= ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
167 # Apple HTTP Live Streaming
168 '96', '95', '94', '93', '92', '132', '151',
170 '85', '84', '102', '83', '101', '82', '100',
172 '138', '137', '248', '136', '247', '135', '246',
173 '245', '244', '134', '243', '133', '242', '160',
175 '141', '172', '140', '171', '139',
177 _available_formats_prefer_free
= ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
178 # Apple HTTP Live Streaming
179 '96', '95', '94', '93', '92', '132', '151',
181 '85', '102', '84', '101', '83', '100', '82',
183 '138', '248', '137', '247', '136', '246', '245',
184 '244', '135', '243', '134', '242', '133', '160',
186 '172', '141', '171', '140', '139',
188 _video_formats_map
= {
189 'flv': ['35', '34', '6', '5'],
190 '3gp': ['36', '17', '13'],
191 'mp4': ['38', '37', '22', '18'],
192 'webm': ['46', '45', '44', '43'],
194 _video_extensions
= {
216 # Apple HTTP Live Streaming
250 _video_dimensions
= {
332 u
"url": u
"http://www.youtube.com/watch?v=BaW_jenozKc",
333 u
"file": u
"BaW_jenozKc.mp4",
335 u
"title": u
"youtube-dl test video \"'/\\ä↭𝕐",
336 u
"uploader": u
"Philipp Hagemeister",
337 u
"uploader_id": u
"phihag",
338 u
"upload_date": u
"20121002",
339 u
"description": u
"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
343 u
"url": u
"http://www.youtube.com/watch?v=UxxajLWwzqY",
344 u
"file": u
"UxxajLWwzqY.mp4",
345 u
"note": u
"Test generic use_cipher_signature video (#897)",
347 u
"upload_date": u
"20120506",
348 u
"title": u
"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
349 u
"description": u
"md5:5b292926389560516e384ac437c0ec07",
350 u
"uploader": u
"Icona Pop",
351 u
"uploader_id": u
"IconaPop"
355 u
"url": u
"https://www.youtube.com/watch?v=07FYdnEawAQ",
356 u
"file": u
"07FYdnEawAQ.mp4",
357 u
"note": u
"Test VEVO video with age protection (#956)",
359 u
"upload_date": u
"20130703",
360 u
"title": u
"Justin Timberlake - Tunnel Vision (Explicit)",
361 u
"description": u
"md5:64249768eec3bc4276236606ea996373",
362 u
"uploader": u
"justintimberlakeVEVO",
363 u
"uploader_id": u
"justintimberlakeVEVO"
367 u
"url": u
"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
368 u
"file": u
"yZIXLfi8CZQ.mp4",
369 u
"note": u
"Embed-only video (#1746)",
371 u
"upload_date": u
"20120608",
372 u
"title": u
"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
373 u
"description": u
"md5:09b78bd971f1e3e289601dfba15ca4f7",
374 u
"uploader": u
"SET India",
375 u
"uploader_id": u
"setindia"
382 def suitable(cls
, url
):
383 """Receives a URL and returns True if suitable for this IE."""
384 if YoutubePlaylistIE
.suitable(url
): return False
385 return re
.match(cls
._VALID
_URL
, url
) is not None
387 def __init__(self
, *args
, **kwargs
):
388 super(YoutubeIE
, self
).__init
__(*args
, **kwargs
)
389 self
._player
_cache
= {}
391 def report_video_webpage_download(self
, video_id
):
392 """Report attempt to download video webpage."""
393 self
.to_screen(u
'%s: Downloading video webpage' % video_id
)
395 def report_video_info_webpage_download(self
, video_id
):
396 """Report attempt to download video info webpage."""
397 self
.to_screen(u
'%s: Downloading video info webpage' % video_id
)
399 def report_information_extraction(self
, video_id
):
400 """Report attempt to extract video information."""
401 self
.to_screen(u
'%s: Extracting video information' % video_id
)
403 def report_unavailable_format(self
, video_id
, format
):
404 """Report extracted video URL."""
405 self
.to_screen(u
'%s: Format %s not available' % (video_id
, format
))
407 def report_rtmp_download(self
):
408 """Indicate the download will use the RTMP protocol."""
409 self
.to_screen(u
'RTMP download detected')
411 def _extract_signature_function(self
, video_id
, player_url
, slen
):
412 id_m
= re
.match(r
'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
414 player_type
= id_m
.group('ext')
415 player_id
= id_m
.group('id')
417 # Read from filesystem cache
418 func_id
= '%s_%s_%d' % (player_type
, player_id
, slen
)
419 assert os
.path
.basename(func_id
) == func_id
420 cache_dir
= get_cachedir(self
._downloader
.params
)
422 cache_enabled
= cache_dir
is not None
424 cache_fn
= os
.path
.join(os
.path
.expanduser(cache_dir
),
428 with io
.open(cache_fn
, 'r', encoding
='utf-8') as cachef
:
429 cache_spec
= json
.load(cachef
)
430 return lambda s
: u
''.join(s
[i
] for i
in cache_spec
)
432 pass # No cache available
434 if player_type
== 'js':
435 code
= self
._download
_webpage
(
436 player_url
, video_id
,
437 note
=u
'Downloading %s player %s' % (player_type
, player_id
),
438 errnote
=u
'Download of %s failed' % player_url
)
439 res
= self
._parse
_sig
_js
(code
)
440 elif player_type
== 'swf':
441 urlh
= self
._request
_webpage
(
442 player_url
, video_id
,
443 note
=u
'Downloading %s player %s' % (player_type
, player_id
),
444 errnote
=u
'Download of %s failed' % player_url
)
446 res
= self
._parse
_sig
_swf
(code
)
448 assert False, 'Invalid player type %r' % player_type
452 test_string
= u
''.join(map(compat_chr
, range(slen
)))
453 cache_res
= res(test_string
)
454 cache_spec
= [ord(c
) for c
in cache_res
]
456 os
.makedirs(os
.path
.dirname(cache_fn
))
457 except OSError as ose
:
458 if ose
.errno
!= errno
.EEXIST
:
460 write_json_file(cache_spec
, cache_fn
)
462 tb
= traceback
.format_exc()
463 self
._downloader
.report_warning(
464 u
'Writing cache to %r failed: %s' % (cache_fn
, tb
))
468 def _print_sig_code(self
, func
, slen
):
469 def gen_sig_code(idxs
):
470 def _genslice(start
, end
, step
):
471 starts
= u
'' if start
== 0 else str(start
)
472 ends
= (u
':%d' % (end
+step
)) if end
+ step
>= 0 else u
':'
473 steps
= u
'' if step
== 1 else (u
':%d' % step
)
474 return u
's[%s%s%s]' % (starts
, ends
, steps
)
477 start
= '(Never used)' # Quelch pyflakes warnings - start will be
478 # set as soon as step is set
479 for i
, prev
in zip(idxs
[1:], idxs
[:-1]):
483 yield _genslice(start
, prev
, step
)
486 if i
- prev
in [-1, 1]:
491 yield u
's[%d]' % prev
495 yield _genslice(start
, i
, step
)
497 test_string
= u
''.join(map(compat_chr
, range(slen
)))
498 cache_res
= func(test_string
)
499 cache_spec
= [ord(c
) for c
in cache_res
]
500 expr_code
= u
' + '.join(gen_sig_code(cache_spec
))
501 code
= u
'if len(s) == %d:\n return %s\n' % (slen
, expr_code
)
502 self
.to_screen(u
'Extracted signature function:\n' + code
)
504 def _parse_sig_js(self
, jscode
):
505 funcname
= self
._search
_regex
(
506 r
'signature=([a-zA-Z]+)', jscode
,
507 u
'Initial JS player signature function name')
512 return string
.lowercase
.index(varname
)
514 def interpret_statement(stmt
, local_vars
, allow_recursion
=20):
515 if allow_recursion
< 0:
516 raise ExtractorError(u
'Recursion limit reached')
518 if stmt
.startswith(u
'var '):
519 stmt
= stmt
[len(u
'var '):]
520 ass_m
= re
.match(r
'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
521 r
'=(?P<expr>.*)$', stmt
)
523 if ass_m
.groupdict().get('index'):
525 lvar
= local_vars
[ass_m
.group('out')]
526 idx
= interpret_expression(ass_m
.group('index'),
527 local_vars
, allow_recursion
)
528 assert isinstance(idx
, int)
531 expr
= ass_m
.group('expr')
534 local_vars
[ass_m
.group('out')] = val
536 expr
= ass_m
.group('expr')
537 elif stmt
.startswith(u
'return '):
539 expr
= stmt
[len(u
'return '):]
541 raise ExtractorError(
542 u
'Cannot determine left side of statement in %r' % stmt
)
544 v
= interpret_expression(expr
, local_vars
, allow_recursion
)
547 def interpret_expression(expr
, local_vars
, allow_recursion
):
552 return local_vars
[expr
]
554 m
= re
.match(r
'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr
)
556 member
= m
.group('member')
557 val
= local_vars
[m
.group('in')]
558 if member
== 'split("")':
560 if member
== 'join("")':
562 if member
== 'length':
564 if member
== 'reverse()':
566 slice_m
= re
.match(r
'slice\((?P<idx>.*)\)', member
)
568 idx
= interpret_expression(
569 slice_m
.group('idx'), local_vars
, allow_recursion
-1)
573 r
'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr
)
575 val
= local_vars
[m
.group('in')]
576 idx
= interpret_expression(m
.group('idx'), local_vars
,
580 m
= re
.match(r
'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr
)
582 a
= interpret_expression(m
.group('a'),
583 local_vars
, allow_recursion
)
584 b
= interpret_expression(m
.group('b'),
585 local_vars
, allow_recursion
)
589 r
'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr
)
591 fname
= m
.group('func')
592 if fname
not in functions
:
593 functions
[fname
] = extract_function(fname
)
594 argvals
= [int(v
) if v
.isdigit() else local_vars
[v
]
595 for v
in m
.group('args').split(',')]
596 return functions
[fname
](argvals
)
597 raise ExtractorError(u
'Unsupported JS expression %r' % expr
)
599 def extract_function(funcname
):
601 r
'function ' + re
.escape(funcname
) +
602 r
'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
604 argnames
= func_m
.group('args').split(',')
607 local_vars
= dict(zip(argnames
, args
))
608 for stmt
in func_m
.group('code').split(';'):
609 res
= interpret_statement(stmt
, local_vars
)
613 initial_function
= extract_function(funcname
)
614 return lambda s
: initial_function([s
])
616 def _parse_sig_swf(self
, file_contents
):
617 if file_contents
[1:3] != b
'WS':
618 raise ExtractorError(
619 u
'Not an SWF file; header is %r' % file_contents
[:3])
620 if file_contents
[:1] == b
'C':
621 content
= zlib
.decompress(file_contents
[8:])
623 raise NotImplementedError(u
'Unsupported compression format %r' %
626 def extract_tags(content
):
628 while pos
< len(content
):
629 header16
= struct
.unpack('<H', content
[pos
:pos
+2])[0]
631 tag_code
= header16
>> 6
632 tag_len
= header16
& 0x3f
634 tag_len
= struct
.unpack('<I', content
[pos
:pos
+4])[0]
636 assert pos
+tag_len
<= len(content
)
637 yield (tag_code
, content
[pos
:pos
+tag_len
])
641 for tag_code
, tag
in extract_tags(content
)
643 p
= code_tag
.index(b
'\0', 4) + 1
644 code_reader
= io
.BytesIO(code_tag
[p
:])
646 # Parse ABC (AVM2 ByteCode)
647 def read_int(reader
=None):
655 b
= struct
.unpack('<B', buf
)[0]
656 res
= res |
((b
& 0x7f) << shift
)
662 def u30(reader
=None):
663 res
= read_int(reader
)
664 assert res
& 0xf0000000 == 0
668 def s32(reader
=None):
670 if v
& 0x80000000 != 0:
671 v
= - ((v ^
0xffffffff) + 1)
674 def read_string(reader
=None):
678 resb
= reader
.read(slen
)
679 assert len(resb
) == slen
680 return resb
.decode('utf-8')
682 def read_bytes(count
, reader
=None):
685 resb
= reader
.read(count
)
686 assert len(resb
) == count
689 def read_byte(reader
=None):
690 resb
= read_bytes(1, reader
=reader
)
691 res
= struct
.unpack('<B', resb
)[0]
694 # minor_version + major_version
699 for _c
in range(1, int_count
):
702 for _c
in range(1, uint_count
):
705 read_bytes((double_count
-1) * 8)
707 constant_strings
= [u
'']
708 for _c
in range(1, string_count
):
710 constant_strings
.append(s
)
711 namespace_count
= u30()
712 for _c
in range(1, namespace_count
):
716 for _c
in range(1, ns_set_count
):
718 for _c2
in range(count
):
720 multiname_count
= u30()
729 0x0e: 2, # MultinameA
730 0x1b: 1, # MultinameL
731 0x1c: 1, # MultinameLA
734 for _c
in range(1, multiname_count
):
736 assert kind
in MULTINAME_SIZES
, u
'Invalid multiname kind %r' % kind
738 u30() # namespace_idx
740 multinames
.append(constant_strings
[name_idx
])
742 multinames
.append('[MULTINAME kind: %d]' % kind
)
743 for _c2
in range(MULTINAME_SIZES
[kind
]):
748 MethodInfo
= collections
.namedtuple(
750 ['NEED_ARGUMENTS', 'NEED_REST'])
752 for method_id
in range(method_count
):
755 for _
in range(param_count
):
757 u30() # name index (always 0 for youtube)
759 if flags
& 0x08 != 0:
762 for c
in range(option_count
):
765 if flags
& 0x80 != 0:
766 # Param names present
767 for _
in range(param_count
):
769 mi
= MethodInfo(flags
& 0x01 != 0, flags
& 0x04 != 0)
770 method_infos
.append(mi
)
773 metadata_count
= u30()
774 for _c
in range(metadata_count
):
777 for _c2
in range(item_count
):
781 def parse_traits_info():
782 trait_name_idx
= u30()
783 kind_full
= read_byte()
784 kind
= kind_full
& 0x0f
785 attrs
= kind_full
>> 4
787 if kind
in [0x00, 0x06]: # Slot or Const
789 u30() # type_name_idx
793 elif kind
in [0x01, 0x02, 0x03]: # Method / Getter / Setter
796 methods
[multinames
[trait_name_idx
]] = method_idx
797 elif kind
== 0x04: # Class
800 elif kind
== 0x05: # Function
803 methods
[function_idx
] = multinames
[trait_name_idx
]
805 raise ExtractorError(u
'Unsupported trait kind %d' % kind
)
807 if attrs
& 0x4 != 0: # Metadata present
808 metadata_count
= u30()
809 for _c3
in range(metadata_count
):
810 u30() # metadata index
815 TARGET_CLASSNAME
= u
'SignatureDecipher'
816 searched_idx
= multinames
.index(TARGET_CLASSNAME
)
817 searched_class_id
= None
819 for class_id
in range(class_count
):
821 if name_idx
== searched_idx
:
822 # We found the class we're looking for!
823 searched_class_id
= class_id
824 u30() # super_name idx
826 if flags
& 0x08 != 0: # Protected namespace is present
827 u30() # protected_ns_idx
829 for _c2
in range(intrf_count
):
833 for _c2
in range(trait_count
):
836 if searched_class_id
is None:
837 raise ExtractorError(u
'Target class %r not found' %
842 for class_id
in range(class_count
):
845 for _c2
in range(trait_count
):
846 trait_methods
= parse_traits_info()
847 if class_id
== searched_class_id
:
848 method_names
.update(trait_methods
.items())
849 method_idxs
.update(dict(
851 for name
, idx
in trait_methods
.items()))
855 for _c
in range(script_count
):
858 for _c2
in range(trait_count
):
862 method_body_count
= u30()
863 Method
= collections
.namedtuple('Method', ['code', 'local_count'])
865 for _c
in range(method_body_count
):
869 u30() # init_scope_depth
870 u30() # max_scope_depth
872 code
= read_bytes(code_length
)
873 if method_idx
in method_idxs
:
874 m
= Method(code
, local_count
)
875 methods
[method_idxs
[method_idx
]] = m
876 exception_count
= u30()
877 for _c2
in range(exception_count
):
884 for _c2
in range(trait_count
):
887 assert p
+ code_reader
.tell() == len(code_tag
)
888 assert len(methods
) == len(method_idxs
)
890 method_pyfunctions
= {}
892 def extract_function(func_name
):
893 if func_name
in method_pyfunctions
:
894 return method_pyfunctions
[func_name
]
895 if func_name
not in methods
:
896 raise ExtractorError(u
'Cannot find function %r' % func_name
)
897 m
= methods
[func_name
]
900 registers
= ['(this)'] + list(args
) + [None] * m
.local_count
902 coder
= io
.BytesIO(m
.code
)
904 opcode
= struct
.unpack('!B', coder
.read(1))[0]
905 if opcode
== 36: # pushbyte
906 v
= struct
.unpack('!B', coder
.read(1))[0]
908 elif opcode
== 44: # pushstring
910 stack
.append(constant_strings
[idx
])
911 elif opcode
== 48: # pushscope
912 # We don't implement the scope register, so we'll just
913 # ignore the popped value
915 elif opcode
== 70: # callproperty
917 mname
= multinames
[index
]
918 arg_count
= u30(coder
)
919 args
= list(reversed(
920 [stack
.pop() for _
in range(arg_count
)]))
922 if mname
== u
'split':
923 assert len(args
) == 1
924 assert isinstance(args
[0], compat_str
)
925 assert isinstance(obj
, compat_str
)
929 res
= obj
.split(args
[0])
931 elif mname
== u
'slice':
932 assert len(args
) == 1
933 assert isinstance(args
[0], int)
934 assert isinstance(obj
, list)
937 elif mname
== u
'join':
938 assert len(args
) == 1
939 assert isinstance(args
[0], compat_str
)
940 assert isinstance(obj
, list)
941 res
= args
[0].join(obj
)
943 elif mname
in method_pyfunctions
:
944 stack
.append(method_pyfunctions
[mname
](args
))
946 raise NotImplementedError(
947 u
'Unsupported property %r on %r'
949 elif opcode
== 72: # returnvalue
952 elif opcode
== 79: # callpropvoid
954 mname
= multinames
[index
]
955 arg_count
= u30(coder
)
956 args
= list(reversed(
957 [stack
.pop() for _
in range(arg_count
)]))
959 if mname
== u
'reverse':
960 assert isinstance(obj
, list)
963 raise NotImplementedError(
964 u
'Unsupported (void) property %r on %r'
966 elif opcode
== 93: # findpropstrict
968 mname
= multinames
[index
]
969 res
= extract_function(mname
)
971 elif opcode
== 97: # setproperty
976 assert isinstance(obj
, list)
977 assert isinstance(idx
, int)
979 elif opcode
== 98: # getlocal
981 stack
.append(registers
[index
])
982 elif opcode
== 99: # setlocal
985 registers
[index
] = value
986 elif opcode
== 102: # getproperty
988 pname
= multinames
[index
]
989 if pname
== u
'length':
991 assert isinstance(obj
, list)
992 stack
.append(len(obj
))
993 else: # Assume attribute access
995 assert isinstance(idx
, int)
997 assert isinstance(obj
, list)
998 stack
.append(obj
[idx
])
999 elif opcode
== 128: # coerce
1001 elif opcode
== 133: # coerce_s
1002 assert isinstance(stack
[-1], (type(None), compat_str
))
1003 elif opcode
== 164: # modulo
1004 value2
= stack
.pop()
1005 value1
= stack
.pop()
1006 res
= value1
% value2
1008 elif opcode
== 208: # getlocal_0
1009 stack
.append(registers
[0])
1010 elif opcode
== 209: # getlocal_1
1011 stack
.append(registers
[1])
1012 elif opcode
== 210: # getlocal_2
1013 stack
.append(registers
[2])
1014 elif opcode
== 211: # getlocal_3
1015 stack
.append(registers
[3])
1016 elif opcode
== 214: # setlocal_2
1017 registers
[2] = stack
.pop()
1018 elif opcode
== 215: # setlocal_3
1019 registers
[3] = stack
.pop()
1021 raise NotImplementedError(
1022 u
'Unsupported opcode %d' % opcode
)
1024 method_pyfunctions
[func_name
] = resfunc
1027 initial_function
= extract_function(u
'decipher')
1028 return lambda s
: initial_function([s
])
1030 def _decrypt_signature(self
, s
, video_id
, player_url
, age_gate
=False):
1031 """Turn the encrypted s field into a working signature"""
1033 if player_url
is not None:
1034 if player_url
.startswith(u
'//'):
1035 player_url
= u
'https:' + player_url
1037 player_id
= (player_url
, len(s
))
1038 if player_id
not in self
._player
_cache
:
1039 func
= self
._extract
_signature
_function
(
1040 video_id
, player_url
, len(s
)
1042 self
._player
_cache
[player_id
] = func
1043 func
= self
._player
_cache
[player_id
]
1044 if self
._downloader
.params
.get('youtube_print_sig_code'):
1045 self
._print
_sig
_code
(func
, len(s
))
1048 tb
= traceback
.format_exc()
1049 self
._downloader
.report_warning(
1050 u
'Automatic signature extraction failed: ' + tb
)
1052 self
._downloader
.report_warning(
1053 u
'Warning: Falling back to static signature algorithm')
1055 return self
._static
_decrypt
_signature
(
1056 s
, video_id
, player_url
, age_gate
)
1058 def _static_decrypt_signature(self
, s
, video_id
, player_url
, age_gate
):
1060 # The videos with age protection use another player, so the
1061 # algorithms can be different.
1063 return s
[2:63] + s
[82] + s
[64:82] + s
[63]
1066 return s
[86:29:-1] + s
[88] + s
[28:5:-1]
1068 return s
[25] + s
[3:25] + s
[0] + s
[26:42] + s
[79] + s
[43:79] + s
[91] + s
[80:83]
1070 return s
[84:27:-1] + s
[86] + s
[26:5:-1]
1072 return s
[25] + s
[3:25] + s
[2] + s
[26:40] + s
[77] + s
[41:77] + s
[89] + s
[78:81]
1074 return s
[84:78:-1] + s
[87] + s
[77:60:-1] + s
[0] + s
[59:3:-1]
1076 return s
[7:28] + s
[87] + s
[29:45] + s
[55] + s
[46:55] + s
[2] + s
[56:87] + s
[28]
1078 return s
[6:27] + s
[4] + s
[28:39] + s
[27] + s
[40:59] + s
[2] + s
[60:]
1080 return s
[80:72:-1] + s
[16] + s
[71:39:-1] + s
[72] + s
[38:16:-1] + s
[82] + s
[15::-1]
1082 return s
[3:11] + s
[0] + s
[12:55] + s
[84] + s
[56:84]
1084 return s
[78:70:-1] + s
[14] + s
[69:37:-1] + s
[70] + s
[36:14:-1] + s
[80] + s
[:14][::-1]
1086 return s
[80:63:-1] + s
[0] + s
[62:0:-1] + s
[63]
1088 return s
[80:37:-1] + s
[7] + s
[36:7:-1] + s
[0] + s
[6:0:-1] + s
[37]
1090 return s
[56] + s
[79:56:-1] + s
[41] + s
[55:41:-1] + s
[80] + s
[40:34:-1] + s
[0] + s
[33:29:-1] + s
[34] + s
[28:9:-1] + s
[29] + s
[8:0:-1] + s
[9]
1092 return s
[1:19] + s
[0] + s
[20:68] + s
[19] + s
[69:80]
1094 return s
[54] + s
[77:54:-1] + s
[39] + s
[53:39:-1] + s
[78] + s
[38:34:-1] + s
[0] + s
[33:29:-1] + s
[34] + s
[28:9:-1] + s
[29] + s
[8:0:-1] + s
[9]
1097 raise ExtractorError(u
'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s
)))
1099 def _get_available_subtitles(self
, video_id
, webpage
):
1101 sub_list
= self
._download
_webpage
(
1102 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
,
1103 video_id
, note
=False)
1104 except ExtractorError
as err
:
1105 self
._downloader
.report_warning(u
'unable to download video subtitles: %s' % compat_str(err
))
1107 lang_list
= re
.findall(r
'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list
)
1112 params
= compat_urllib_parse
.urlencode({
1115 'fmt': self
._downloader
.params
.get('subtitlesformat', 'srt'),
1116 'name': l
[0].encode('utf-8'),
1118 url
= u
'http://www.youtube.com/api/timedtext?' + params
1119 sub_lang_list
[lang
] = url
1120 if not sub_lang_list
:
1121 self
._downloader
.report_warning(u
'video doesn\'t have subtitles')
1123 return sub_lang_list
1125 def _get_available_automatic_caption(self
, video_id
, webpage
):
1126 """We need the webpage for getting the captions url, pass it as an
1127 argument to speed up the process."""
1128 sub_format
= self
._downloader
.params
.get('subtitlesformat', 'srt')
1129 self
.to_screen(u
'%s: Looking for automatic captions' % video_id
)
1130 mobj
= re
.search(r
';ytplayer.config = ({.*?});', webpage
)
1131 err_msg
= u
'Couldn\'t find automatic captions for %s' % video_id
1133 self
._downloader
.report_warning(err_msg
)
1135 player_config
= json
.loads(mobj
.group(1))
1137 args
= player_config
[u
'args']
1138 caption_url
= args
[u
'ttsurl']
1139 timestamp
= args
[u
'timestamp']
1140 # We get the available subtitles
1141 list_params
= compat_urllib_parse
.urlencode({
1146 list_url
= caption_url
+ '&' + list_params
1147 caption_list
= self
._download
_xml
(list_url
, video_id
)
1148 original_lang_node
= caption_list
.find('track')
1149 if original_lang_node
is None or original_lang_node
.attrib
.get('kind') != 'asr' :
1150 self
._downloader
.report_warning(u
'Video doesn\'t have automatic captions')
1152 original_lang
= original_lang_node
.attrib
['lang_code']
1155 for lang_node
in caption_list
.findall('target'):
1156 sub_lang
= lang_node
.attrib
['lang_code']
1157 params
= compat_urllib_parse
.urlencode({
1158 'lang': original_lang
,
1164 sub_lang_list
[sub_lang
] = caption_url
+ '&' + params
1165 return sub_lang_list
1166 # An extractor error can be raise by the download process if there are
1167 # no automatic captions but there are subtitles
1168 except (KeyError, ExtractorError
):
1169 self
._downloader
.report_warning(err_msg
)
1172 def _print_formats(self
, formats
):
1173 print('Available formats:')
1175 print('%s\t:\t%s\t[%s]%s' %(x
, self
._video
_extensions
.get(x
, 'flv'),
1176 self
._video
_dimensions
.get(x
, '???'),
1177 ' ('+self
._special
_itags
[x
]+')' if x
in self
._special
_itags
else ''))
1179 def _extract_id(self
, url
):
1180 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
1182 raise ExtractorError(u
'Invalid URL: %s' % url
)
1183 video_id
= mobj
.group(2)
1186 def _get_video_url_list(self
, url_map
):
1188 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1189 with the requested formats.
1191 req_format
= self
._downloader
.params
.get('format', None)
1192 format_limit
= self
._downloader
.params
.get('format_limit', None)
1193 available_formats
= self
._available
_formats
_prefer
_free
if self
._downloader
.params
.get('prefer_free_formats', False) else self
._available
_formats
1194 if format_limit
is not None and format_limit
in available_formats
:
1195 format_list
= available_formats
[available_formats
.index(format_limit
):]
1197 format_list
= available_formats
1198 existing_formats
= [x
for x
in format_list
if x
in url_map
]
1199 if len(existing_formats
) == 0:
1200 raise ExtractorError(u
'no known formats available for video')
1201 if self
._downloader
.params
.get('listformats', None):
1202 self
._print
_formats
(existing_formats
)
1204 if req_format
is None or req_format
== 'best':
1205 video_url_list
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality
1206 elif req_format
== 'worst':
1207 video_url_list
= [(existing_formats
[-1], url_map
[existing_formats
[-1]])] # worst quality
1208 elif req_format
in ('-1', 'all'):
1209 video_url_list
= [(f
, url_map
[f
]) for f
in existing_formats
] # All formats
1211 # Specific formats. We pick the first in a slash-delimeted sequence.
1212 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1213 # available in the specified format. For example,
1214 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1215 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1216 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1217 req_formats
= req_format
.split('/')
1218 video_url_list
= None
1219 for rf
in req_formats
:
1221 video_url_list
= [(rf
, url_map
[rf
])]
1223 if rf
in self
._video
_formats
_map
:
1224 for srf
in self
._video
_formats
_map
[rf
]:
1226 video_url_list
= [(srf
, url_map
[srf
])]
1231 if video_url_list
is None:
1232 raise ExtractorError(u
'requested format not available')
1233 return video_url_list
1235 def _extract_from_m3u8(self
, manifest_url
, video_id
):
1237 def _get_urls(_manifest
):
1238 lines
= _manifest
.split('\n')
1239 urls
= filter(lambda l
: l
and not l
.startswith('#'),
1242 manifest
= self
._download
_webpage
(manifest_url
, video_id
, u
'Downloading formats manifest')
1243 formats_urls
= _get_urls(manifest
)
1244 for format_url
in formats_urls
:
1245 itag
= self
._search
_regex
(r
'itag/(\d+?)/', format_url
, 'itag')
1246 url_map
[itag
] = format_url
1249 def _extract_annotations(self
, video_id
):
1250 url
= 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
1251 return self
._download
_webpage
(url
, video_id
, note
=u
'Searching for annotations.', errnote
=u
'Unable to download video annotations.')
1253 def _real_extract(self
, url
):
1254 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1255 mobj
= re
.search(self
._NEXT
_URL
_RE
, url
)
1257 url
= 'https://www.youtube.com/' + compat_urllib_parse
.unquote(mobj
.group(1)).lstrip('/')
1258 video_id
= self
._extract
_id
(url
)
1261 self
.report_video_webpage_download(video_id
)
1262 url
= 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1263 request
= compat_urllib_request
.Request(url
)
1265 video_webpage_bytes
= compat_urllib_request
.urlopen(request
).read()
1266 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
1267 raise ExtractorError(u
'Unable to download video webpage: %s' % compat_str(err
))
1269 video_webpage
= video_webpage_bytes
.decode('utf-8', 'ignore')
1271 # Attempt to extract SWF player URL
1272 mobj
= re
.search(r
'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
)
1273 if mobj
is not None:
1274 player_url
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1))
1279 self
.report_video_info_webpage_download(video_id
)
1280 if re
.search(r
'player-age-gate-content">', video_webpage
) is not None:
1281 self
.report_age_confirmation()
1283 # We simulate the access to the video from www.youtube.com/v/{video_id}
1284 # this can be viewed without login into Youtube
1285 data
= compat_urllib_parse
.urlencode({'video_id': video_id
,
1286 'el': 'player_embedded',
1289 'eurl': 'https://youtube.googleapis.com/v/' + video_id
,
1293 video_info_url
= 'https://www.youtube.com/get_video_info?' + data
1294 video_info_webpage
= self
._download
_webpage
(video_info_url
, video_id
,
1296 errnote
='unable to download video info webpage')
1297 video_info
= compat_parse_qs(video_info_webpage
)
1300 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1301 video_info_url
= ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1302 % (video_id
, el_type
))
1303 video_info_webpage
= self
._download
_webpage
(video_info_url
, video_id
,
1305 errnote
='unable to download video info webpage')
1306 video_info
= compat_parse_qs(video_info_webpage
)
1307 if 'token' in video_info
:
1309 if 'token' not in video_info
:
1310 if 'reason' in video_info
:
1311 raise ExtractorError(u
'YouTube said: %s' % video_info
['reason'][0], expected
=True)
1313 raise ExtractorError(u
'"token" parameter not in video info for unknown reason')
1315 if 'view_count' in video_info
:
1316 view_count
= int(video_info
['view_count'][0])
1320 # Check for "rental" videos
1321 if 'ypc_video_rental_bar_text' in video_info
and 'author' not in video_info
:
1322 raise ExtractorError(u
'"rental" videos not supported')
1324 # Start extracting information
1325 self
.report_information_extraction(video_id
)
1328 if 'author' not in video_info
:
1329 raise ExtractorError(u
'Unable to extract uploader name')
1330 video_uploader
= compat_urllib_parse
.unquote_plus(video_info
['author'][0])
1333 video_uploader_id
= None
1334 mobj
= re
.search(r
'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage
)
1335 if mobj
is not None:
1336 video_uploader_id
= mobj
.group(1)
1338 self
._downloader
.report_warning(u
'unable to extract uploader nickname')
1341 if 'title' in video_info
:
1342 video_title
= compat_urllib_parse
.unquote_plus(video_info
['title'][0])
1344 self
._downloader
.report_warning(u
'Unable to extract video title')
1348 # We try first to get a high quality image:
1349 m_thumb
= re
.search(r
'<span itemprop="thumbnail".*?href="(.*?)">',
1350 video_webpage
, re
.DOTALL
)
1351 if m_thumb
is not None:
1352 video_thumbnail
= m_thumb
.group(1)
1353 elif 'thumbnail_url' not in video_info
:
1354 self
._downloader
.report_warning(u
'unable to extract video thumbnail')
1355 video_thumbnail
= None
1356 else: # don't panic if we can't find it
1357 video_thumbnail
= compat_urllib_parse
.unquote_plus(video_info
['thumbnail_url'][0])
1361 mobj
= re
.search(r
'id="eow-date.*?>(.*?)</span>', video_webpage
, re
.DOTALL
)
1362 if mobj
is not None:
1363 upload_date
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split())
1364 upload_date
= unified_strdate(upload_date
)
1367 video_description
= get_element_by_id("eow-description", video_webpage
)
1368 if video_description
:
1369 video_description
= clean_html(video_description
)
1371 fd_mobj
= re
.search(r
'<meta name="description" content="([^"]+)"', video_webpage
)
1373 video_description
= unescapeHTML(fd_mobj
.group(1))
1375 video_description
= u
''
1378 video_subtitles
= self
.extract_subtitles(video_id
, video_webpage
)
1380 if self
._downloader
.params
.get('listsubtitles', False):
1381 self
._list
_available
_subtitles
(video_id
, video_webpage
)
1384 if 'length_seconds' not in video_info
:
1385 self
._downloader
.report_warning(u
'unable to extract video duration')
1388 video_duration
= compat_urllib_parse
.unquote_plus(video_info
['length_seconds'][0])
1391 video_annotations
= None
1392 if self
._downloader
.params
.get('writeannotations', False):
1393 video_annotations
= self
._extract
_annotations
(video_id
)
1395 # Decide which formats to download
1398 mobj
= re
.search(r
';ytplayer.config = ({.*?});', video_webpage
)
1400 raise ValueError('Could not find vevo ID')
1401 info
= json
.loads(mobj
.group(1))
1403 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1404 # this signatures are encrypted
1405 if 'url_encoded_fmt_stream_map' not in args
:
1406 raise ValueError(u
'No stream_map present') # caught below
1407 re_signature
= re
.compile(r
'[&,]s=')
1408 m_s
= re_signature
.search(args
['url_encoded_fmt_stream_map'])
1410 self
.to_screen(u
'%s: Encrypted signatures detected.' % video_id
)
1411 video_info
['url_encoded_fmt_stream_map'] = [args
['url_encoded_fmt_stream_map']]
1412 m_s
= re_signature
.search(args
.get('adaptive_fmts', u
''))
1414 if 'adaptive_fmts' in video_info
:
1415 video_info
['adaptive_fmts'][0] += ',' + args
['adaptive_fmts']
1417 video_info
['adaptive_fmts'] = [args
['adaptive_fmts']]
1421 if 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
1422 self
.report_rtmp_download()
1423 video_url_list
= [(None, video_info
['conn'][0])]
1424 elif len(video_info
.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info
.get('adaptive_fmts', [])) >= 1:
1425 encoded_url_map
= video_info
.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info
.get('adaptive_fmts',[''])[0]
1426 if 'rtmpe%3Dyes' in encoded_url_map
:
1427 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected
=True)
1429 for url_data_str
in encoded_url_map
.split(','):
1430 url_data
= compat_parse_qs(url_data_str
)
1431 if 'itag' in url_data
and 'url' in url_data
:
1432 url
= url_data
['url'][0]
1433 if 'sig' in url_data
:
1434 url
+= '&signature=' + url_data
['sig'][0]
1435 elif 's' in url_data
:
1436 encrypted_sig
= url_data
['s'][0]
1437 if self
._downloader
.params
.get('verbose'):
1439 if player_url
is None:
1440 player_version
= 'unknown'
1442 player_version
= self
._search
_regex
(
1443 r
'-(.+)\.swf$', player_url
,
1444 u
'flash player', fatal
=False)
1445 player_desc
= 'flash player %s' % player_version
1447 player_version
= self
._search
_regex
(
1448 r
'html5player-(.+?)\.js', video_webpage
,
1449 'html5 player', fatal
=False)
1450 player_desc
= u
'html5 player %s' % player_version
1452 parts_sizes
= u
'.'.join(compat_str(len(part
)) for part
in encrypted_sig
.split('.'))
1453 self
.to_screen(u
'encrypted signature length %d (%s), itag %s, %s' %
1454 (len(encrypted_sig
), parts_sizes
, url_data
['itag'][0], player_desc
))
1457 jsplayer_url_json
= self
._search
_regex
(
1458 r
'"assets":.+?"js":\s*("[^"]+")',
1459 video_webpage
, u
'JS player URL')
1460 player_url
= json
.loads(jsplayer_url_json
)
1462 signature
= self
._decrypt
_signature
(
1463 encrypted_sig
, video_id
, player_url
, age_gate
)
1464 url
+= '&signature=' + signature
1465 if 'ratebypass' not in url
:
1466 url
+= '&ratebypass=yes'
1467 url_map
[url_data
['itag'][0]] = url
1468 video_url_list
= self
._get
_video
_url
_list
(url_map
)
1469 if not video_url_list
:
1471 elif video_info
.get('hlsvp'):
1472 manifest_url
= video_info
['hlsvp'][0]
1473 url_map
= self
._extract
_from
_m
3u8(manifest_url
, video_id
)
1474 video_url_list
= self
._get
_video
_url
_list
(url_map
)
1475 if not video_url_list
:
1479 raise ExtractorError(u
'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1482 for itag
, video_real_url
in video_url_list
:
1484 video_extension
= self
._video
_extensions
.get(itag
, 'flv')
1486 video_format
= '{0} - {1}{2}'.format(itag
if itag
else video_extension
,
1487 self
._video
_dimensions
.get(itag
, '???'),
1488 ' ('+self
._special
_itags
[itag
]+')' if itag
in self
._special
_itags
else '')
1492 'url': video_real_url
,
1493 'uploader': video_uploader
,
1494 'uploader_id': video_uploader_id
,
1495 'upload_date': upload_date
,
1496 'title': video_title
,
1497 'ext': video_extension
,
1498 'format': video_format
,
1500 'thumbnail': video_thumbnail
,
1501 'description': video_description
,
1502 'player_url': player_url
,
1503 'subtitles': video_subtitles
,
1504 'duration': video_duration
,
1505 'age_limit': 18 if age_gate
else 0,
1506 'annotations': video_annotations
,
1507 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id
,
1508 'view_count': view_count
,
1512 class YoutubePlaylistIE(YoutubeBaseInfoExtractor
):
1513 IE_DESC
= u
'YouTube.com playlists'
1514 _VALID_URL
= r
"""(?:
1519 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1520 \? (?:.*?&)*? (?:p|a|list)=
1523 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
1526 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
1528 _TEMPLATE_URL
= 'https://www.youtube.com/playlist?list=%s&page=%s'
1529 _MORE_PAGES_INDICATOR
= r
'data-link-type="next"'
1530 _VIDEO_RE
= r
'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
1531 IE_NAME
= u
'youtube:playlist'
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # _VALID_URL is laid out with whitespace, so it must be matched
    # with the VERBOSE flag.
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
1538 def _real_initialize(self
):
1541 def _ids_to_results(self
, ids
):
1542 return [self
.url_result(vid_id
, 'Youtube', video_id
=vid_id
)
def _extract_mix(self, playlist_id):
    """Extract a Youtube mix playlist.

    Mixes are generated from a single seed video; the playlist id is
    just 'RD' + the seed video id.
    """
    seed_video_id = playlist_id[2:]
    url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
    webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
    # The title element carries one of two class attributes depending
    # on the page variant, so try both.
    title_span = (
        get_element_by_attribute('class', 'title long-title', webpage)
        or get_element_by_attribute('class', 'title ', webpage))
    title = clean_html(title_span)
    video_re = (r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s'
                % re.escape(playlist_id))
    ids = orderedSet(re.findall(video_re, webpage))
    url_results = self._ids_to_results(ids)
    return self.playlist_result(url_results, playlist_id, title)
1559 def _real_extract(self
, url
):
1560 # Extract playlist id
1561 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
1563 raise ExtractorError(u
'Invalid URL: %s' % url
)
1564 playlist_id
= mobj
.group(1) or mobj
.group(2)
1566 # Check if it's a video-specific URL
1567 query_dict
= compat_urlparse
.parse_qs(compat_urlparse
.urlparse(url
).query
)
1568 if 'v' in query_dict
:
1569 video_id
= query_dict
['v'][0]
1570 if self
._downloader
.params
.get('noplaylist'):
1571 self
.to_screen(u
'Downloading just video %s because of --no-playlist' % video_id
)
1572 return self
.url_result(video_id
, 'Youtube', video_id
=video_id
)
1574 self
.to_screen(u
'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id
, video_id
))
1576 if len(playlist_id
) == 13: # 'RD' + 11 characters for the video id
1577 # Mixes require a custom extraction process
1578 return self
._extract
_mix
(playlist_id
)
1580 # Extract the video ids from the playlist pages
1583 for page_num
in itertools
.count(1):
1584 url
= self
._TEMPLATE
_URL
% (playlist_id
, page_num
)
1585 page
= self
._download
_webpage
(url
, playlist_id
, u
'Downloading page #%s' % page_num
)
1586 matches
= re
.finditer(self
._VIDEO
_RE
, page
)
1587 # We remove the duplicates and the link with index 0
1588 # (it's not the first video of the playlist)
1589 new_ids
= orderedSet(m
.group('id') for m
in matches
if m
.group('index') != '0')
1592 if re
.search(self
._MORE
_PAGES
_INDICATOR
, page
) is None:
1595 playlist_title
= self
._og
_search
_title
(page
)
1597 url_results
= self
._ids
_to
_results
(ids
)
1598 return self
.playlist_result(url_results
, playlist_id
, playlist_title
)
1601 class YoutubeChannelIE(InfoExtractor
):
1602 IE_DESC
= u
'YouTube.com channels'
1603 _VALID_URL
= r
"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1604 _MORE_PAGES_INDICATOR
= 'yt-uix-load-more'
1605 _MORE_PAGES_URL
= 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1606 IE_NAME
= u
'youtube:channel'
1608 def extract_videos_from_page(self
, page
):
1610 for mobj
in re
.finditer(r
'href="/watch\?v=([0-9A-Za-z_-]+)&?', page
):
1611 if mobj
.group(1) not in ids_in_page
:
1612 ids_in_page
.append(mobj
.group(1))
1615 def _real_extract(self
, url
):
1616 # Extract channel id
1617 mobj
= re
.match(self
._VALID
_URL
, url
)
1619 raise ExtractorError(u
'Invalid URL: %s' % url
)
1621 # Download channel page
1622 channel_id
= mobj
.group(1)
1624 url
= 'https://www.youtube.com/channel/%s/videos' % channel_id
1625 channel_page
= self
._download
_webpage
(url
, channel_id
)
1626 if re
.search(r
'channel-header-autogenerated-label', channel_page
) is not None:
1627 autogenerated
= True
1629 autogenerated
= False
1632 # The videos are contained in a single page
1633 # the ajax pages can't be used, they are empty
1634 video_ids
= self
.extract_videos_from_page(channel_page
)
1636 # Download all channel pages using the json-based channel_ajax query
1637 for pagenum
in itertools
.count(1):
1638 url
= self
._MORE
_PAGES
_URL
% (pagenum
, channel_id
)
1639 page
= self
._download
_webpage
(url
, channel_id
,
1640 u
'Downloading page #%s' % pagenum
)
1642 page
= json
.loads(page
)
1644 ids_in_page
= self
.extract_videos_from_page(page
['content_html'])
1645 video_ids
.extend(ids_in_page
)
1647 if self
._MORE
_PAGES
_INDICATOR
not in page
['load_more_widget_html']:
1650 self
._downloader
.to_screen(u
'[youtube] Channel %s: Found %i videos' % (channel_id
, len(video_ids
)))
1652 url_entries
= [self
.url_result(video_id
, 'Youtube', video_id
=video_id
)
1653 for video_id
in video_ids
]
1654 return self
.playlist_result(url_entries
, channel_id
)
1657 class YoutubeUserIE(InfoExtractor
):
1658 IE_DESC
= u
'YouTube.com user videos (URL or "ytuser" keyword)'
1659 _VALID_URL
= r
'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
1660 _TEMPLATE_URL
= 'http://gdata.youtube.com/feeds/api/users/%s'
1661 _GDATA_PAGE_SIZE
= 50
1662 _GDATA_URL
= 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
1663 IE_NAME
= u
'youtube:user'
def suitable(cls, url):
    # The user regex is very permissive, so refuse any URL that a
    # different youtube extractor in this module already handles.
    other_ies = (
        klass for (name, klass) in globals().items()
        if name.endswith('IE') and klass is not cls)
    if any(ie.suitable(url) for ie in other_ies):
        return False
    return super(YoutubeUserIE, cls).suitable(url)
1673 def _real_extract(self
, url
):
1675 mobj
= re
.match(self
._VALID
_URL
, url
)
1677 raise ExtractorError(u
'Invalid URL: %s' % url
)
1679 username
= mobj
.group(1)
1681 # Download video ids using YouTube Data API. Result size per
1682 # query is limited (currently to 50 videos) so we need to query
1683 # page by page until there are no video ids - it means we got
1688 for pagenum
in itertools
.count(0):
1689 start_index
= pagenum
* self
._GDATA
_PAGE
_SIZE
+ 1
1691 gdata_url
= self
._GDATA
_URL
% (username
, self
._GDATA
_PAGE
_SIZE
, start_index
)
1692 page
= self
._download
_webpage
(gdata_url
, username
,
1693 u
'Downloading video ids from %d to %d' % (start_index
, start_index
+ self
._GDATA
_PAGE
_SIZE
))
1696 response
= json
.loads(page
)
1697 except ValueError as err
:
1698 raise ExtractorError(u
'Invalid JSON in API response: ' + compat_str(err
))
1699 if 'entry' not in response
['feed']:
1700 # Number of videos is a multiple of self._MAX_RESULTS
1703 # Extract video identifiers
1705 for entry
in response
['feed']['entry']:
1706 ids_in_page
.append(entry
['id']['$t'].split('/')[-1])
1707 video_ids
.extend(ids_in_page
)
1709 # A little optimization - if current page is not
1710 # "full", ie. does not contain PAGE_SIZE video ids then
1711 # we can assume that this page is the last one - there
1712 # are no more ids on further pages - no need to query
1715 if len(ids_in_page
) < self
._GDATA
_PAGE
_SIZE
:
1719 self
.url_result(video_id
, 'Youtube', video_id
=video_id
)
1720 for video_id
in video_ids
]
1721 return self
.playlist_result(url_results
, playlist_title
=username
)
1724 class YoutubeSearchIE(SearchInfoExtractor
):
1725 IE_DESC
= u
'YouTube.com searches'
1726 _API_URL
= 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1728 IE_NAME
= u
'youtube:search'
1729 _SEARCH_KEY
= 'ytsearch'
def report_download_page(self, query, pagenum):
    """Report attempt to download search page with given number."""
    message = u'[youtube] query "%s": Downloading page %s' % (query, pagenum)
    self._downloader.to_screen(message)
1735 def _get_n_results(self
, query
, n
):
1736 """Get a specified number of results for a query"""
1742 while (50 * pagenum
) < limit
:
1743 self
.report_download_page(query
, pagenum
+1)
1744 result_url
= self
._API
_URL
% (compat_urllib_parse
.quote_plus(query
), (50*pagenum
)+1)
1745 request
= compat_urllib_request
.Request(result_url
)
1747 data
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
1748 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
1749 raise ExtractorError(u
'Unable to download API page: %s' % compat_str(err
))
1750 api_response
= json
.loads(data
)['data']
1752 if not 'items' in api_response
:
1753 raise ExtractorError(u
'[youtube] No video results')
1755 new_ids
= list(video
['id'] for video
in api_response
['items'])
1756 video_ids
+= new_ids
1758 limit
= min(n
, api_response
['totalItems'])
1761 if len(video_ids
) > n
:
1762 video_ids
= video_ids
[:n
]
1763 videos
= [self
.url_result(video_id
, 'Youtube', video_id
=video_id
)
1764 for video_id
in video_ids
]
1765 return self
.playlist_result(videos
, query
)
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search extractor that returns the newest videos first."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Same gdata endpoint as the parent, with orderby=published appended.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    IE_NAME = u'youtube:show'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        show_name = mobj.group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Every season of a show is published as its own playlist, so
        # delegate each one to the playlist extractor.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        return [
            self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in season_matches]
1787 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor
):
1789 Base class for extractors that fetch info from
1790 http://www.youtube.com/feed_ajax
1791 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1793 _LOGIN_REQUIRED
= True
1795 # use action_load_personal_feed instead of action_load_system_feed
1796 _PERSONAL_FEED
= False
1799 def _FEED_TEMPLATE(self
):
1800 action
= 'action_load_system_feed'
1801 if self
._PERSONAL
_FEED
:
1802 action
= 'action_load_personal_feed'
1803 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action
, self
._FEED
_NAME
)
1807 return u
'youtube:%s' % self
._FEED
_NAME
1809 def _real_initialize(self
):
1812 def _real_extract(self
, url
):
1814 # The step argument is available only in 2.7 or higher
1815 for i
in itertools
.count(0):
1816 paging
= i
*self
._PAGING
_STEP
1817 info
= self
._download
_webpage
(self
._FEED
_TEMPLATE
% paging
,
1818 u
'%s feed' % self
._FEED
_NAME
,
1819 u
'Downloading page %s' % i
)
1820 info
= json
.loads(info
)
1821 feed_html
= info
['feed_html']
1822 m_ids
= re
.finditer(r
'"/watch\?v=(.*?)["&]', feed_html
)
1823 ids
= orderedSet(m
.group(1) for m
in m_ids
)
1824 feed_entries
.extend(
1825 self
.url_result(video_id
, 'Youtube', video_id
=video_id
)
1826 for video_id
in ids
)
1827 if info
['paging'] is None:
1829 return self
.playlist_result(feed_entries
, playlist_title
=self
._PLAYLIST
_TITLE
)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Extractor for the authenticated user's subscriptions feed."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Extractor for the authenticated user's recommended-videos feed."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Extractor for the authenticated user's "watch later" list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # The watch-later list is per-account, so the personal feed action
    # must be used.
    _PERSONAL_FEED = True
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Extractor for the authenticated user's watch-history feed."""
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string for the regex, matching the sibling extractors.  The
    # previous u'' literal only worked because '\.' happens not to be a
    # recognized escape sequence; the pattern value is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'

    def _real_extract(self, url):
        """Read the paging step from the history page, then delegate to
        the generic feed extraction in the base class."""
        webpage = self._download_webpage('https://www.youtube.com/feed/history', u'History')
        data_paging = self._search_regex(r'data-paging="(\d+)"', webpage, u'data-paging')
        # The step is actually a ridiculously big number (like 1374343569725646)
        self._PAGING_STEP = int(data_paging)
        return super(YoutubeHistoryIE, self)._real_extract(url)
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of an ordinary playlist;
        # hand that id off to the playlist extractor.
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1877 class YoutubeTruncatedURLIE(InfoExtractor
):
1878 IE_NAME
= 'youtube:truncated_url'
1879 IE_DESC
= False # Do not list
1880 _VALID_URL
= r
'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1882 def _real_extract(self
, url
):
1883 raise ExtractorError(
1884 u
'Did you forget to quote the URL? Remember that & is a meta '
1885 u
'character in most shells, so you want to put the URL in quotes, '
1887 u
'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1888 u
' (or simply youtube-dl BaW_jenozKc ).',