2 from __future__
import unicode_literals
7 from .common
import InfoExtractor
8 from .youtube
import YoutubeIE
10 compat_urllib_parse_unquote
,
11 compat_urllib_parse_unquote_plus
,
36 class ArchiveOrgIE(InfoExtractor
):
37 IE_NAME
= 'archive.org'
38 IE_DESC
= 'archive.org video and audio'
39 _VALID_URL
= r
'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^?#]+)(?:[?].*)?$'
41 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
42 'md5': '8af1d4cf447933ed3c7f4871162602db',
44 'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
46 'title': '1968 Demo - FJCC Conference Presentation Reel #1',
47 'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
48 'release_date': '19681210',
49 'timestamp': 1268695290,
50 'upload_date': '20100315',
51 'creator': 'SRI International',
52 'uploader': 'laura@archive.org',
55 'url': 'https://archive.org/details/Cops1922',
56 'md5': '0869000b4ce265e8ca62738b336b268a',
60 'title': 'Buster Keaton\'s "Cops" (1922)',
61 'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c',
62 'uploader': 'yorkmba99@hotmail.com',
63 'timestamp': 1387699629,
64 'upload_date': "20131222",
67 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
68 'only_matching': True,
70 'url': 'https://archive.org/details/Election_Ads',
71 'md5': '284180e857160cf866358700bab668a3',
73 'id': 'Election_Ads/Commercial-JFK1960ElectionAdCampaignJingle.mpg',
74 'title': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
78 'url': 'https://archive.org/details/Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
79 'md5': '7915213ef02559b5501fe630e1a53f59',
81 'id': 'Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
82 'title': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg',
84 'timestamp': 1205588045,
85 'uploader': 'mikedavisstripmaster@yahoo.com',
86 'description': '1960 Presidential Campaign Election Commercials John F Kennedy, Richard M Nixon',
87 'upload_date': '20080315',
90 'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16',
91 'md5': '7d07ffb42aba6537c28e053efa4b54c9',
93 'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t01.flac',
98 'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
99 'md5': 'a07cd8c6ab4ee1560f8a0021717130f3',
101 'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
104 'timestamp': 1205895624,
105 'uploader': 'mvernon54@yahoo.com',
106 'description': 'md5:6a31f1996db0aa0fc9da6d6e708a1bb0',
107 'upload_date': '20080319',
108 'location': 'Barton Hall - Cornell University',
111 'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik',
112 'md5': '7cb019baa9b332e82ea7c10403acd180',
114 'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/01.01. Bells Of Rostov.mp3',
115 'title': 'Bells Of Rostov',
119 'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02.+Song+And+Chorus+In+The+Polovetsian+Camp+From+%22Prince+Igor%22+(Act+2%2C+Scene+1).mp3',
120 'md5': '1d0aabe03edca83ca58d9ed3b493a3c3',
122 'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02. Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1).mp3',
123 'title': 'Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1)',
125 'timestamp': 1569662587,
126 'uploader': 'associate-joygen-odiongan@archive.org',
127 'description': 'md5:012b2d668ae753be36896f343d12a236',
128 'upload_date': '20190928',
def _playlist_data(webpage):
    """Decode the player playlist embedded in an archive.org embed page.

    Locates the first ``<input>`` tag whose ``class`` attribute is
    ``js-play8-playlist`` and JSON-decodes that tag's ``value`` attribute.

    :param webpage: HTML text of the embed page.
    :return: whatever JSON document the ``value`` attribute holds.
    :raises IndexError: when no such tag is present in ``webpage``.
    """
    # The attribute groups on either side of ``class=`` let the class
    # attribute appear at any position inside the tag.
    candidates = re.findall(r'''(?xs)
        <input
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s+class=['"]?js-play8-playlist['"]?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*/>
    ''', webpage)
    playlist_tag = candidates[0]

    tag_attributes = extract_attributes(playlist_tag)
    return json.loads(tag_attributes['value'])
def _real_extract(self, url):
    """Extract an archive.org item as a single media entry or a playlist.

    The embeddable player page supplies the playable entries and their
    subtitle tracks; the metadata API supplies the item-level fields plus
    the per-file formats and thumbnails.

    :param url: URL matched by ``_VALID_URL``. The id part may be either
        ``<identifier>`` or ``<identifier>/<file path>`` to select a single
        entry of the item.
    :return: info dict. When exactly one entry remains it is merged with
        the item-level fields; otherwise a ``'playlist'`` result is built.
    """
    video_id = compat_urllib_parse_unquote_plus(self._match_id(url))
    # Pad with None so a bare identifier still unpacks into two names.
    identifier, entry_id = (video_id.split('/', 1) + [None])[:2]

    # Archive.org metadata API doesn't clearly demarcate playlist entries
    # or subtitle tracks, so we get them from the embeddable player.
    embed_page = self._download_webpage(
        'https://archive.org/embed/' + identifier, identifier)
    playlist = self._playlist_data(embed_page)

    entries = {}
    for p in playlist:
        # If the user specified a playlist entry in the URL, ignore the
        # rest of the playlist.
        if entry_id and p['orig'] != entry_id:
            continue

        entry = {
            'formats': [],
            'thumbnails': [],
            'artist': p.get('artist'),
            'track': p.get('title'),
            'subtitles': {}}
        entries[p['orig']] = entry

        for track in p.get('tracks', []):
            if track['kind'] != 'subtitles':
                continue

            # Collect tracks under 'subtitles' (label -> list of formats)
            # rather than under a stray top-level key named after the label.
            entry['subtitles'].setdefault(track['label'], []).append({
                'url': 'https://archive.org/' + track['file'].lstrip('/')})

    metadata = self._download_json(
        'https://archive.org/metadata/' + identifier, identifier)
    m = metadata['metadata']
    # From here on use the identifier as reported by the metadata API.
    identifier = m['identifier']

    info = {
        'id': identifier,
        'title': m['title'],
        'description': clean_html(m.get('description')),
        'uploader': dict_get(m, ['uploader', 'adder']),
        'creator': m.get('creator'),
        'license': m.get('licenseurl'),
        'release_date': unified_strdate(m.get('date')),
        'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
        'webpage_url': 'https://archive.org/details/' + identifier,
        'location': m.get('venue'),
        'release_year': int_or_none(m.get('year'))}

    for f in metadata['files']:
        if f['name'] in entries:
            # The file is itself a playlist entry: attach per-file fields.
            entries[f['name']] = merge_dicts(entries[f['name']], {
                'id': identifier + '/' + f['name'],
                'title': f.get('title') or f['name'],
                'display_id': f['name'],
                'description': clean_html(f.get('description')),
                'creator': f.get('creator'),
                'duration': parse_duration(f.get('length')),
                'track_number': int_or_none(f.get('track')),
                'album': f.get('album'),
                'discnumber': int_or_none(f.get('disc')),
                'release_year': int_or_none(f.get('year'))})
            entry = entries[f['name']]
        elif f.get('original') in entries:
            # The file is derived from a playlist entry.
            entry = entries[f['original']]
        else:
            continue

        file_url = 'https://archive.org/download/' + identifier + '/' + f['name']

        if f.get('format') == 'Thumbnail':
            entry['thumbnails'].append({
                'id': f['name'],
                'url': file_url,
                'width': int_or_none(f.get('width')),
                # Previously read from 'width', which reported a wrong height.
                'height': int_or_none(f.get('height')),
                'filesize': int_or_none(f.get('size'))})

        # Pad with None so names without a dot yield no extension.
        extension = (f['name'].rsplit('.', 1) + [None])[1]
        if extension in KNOWN_EXTENSIONS:
            entry['formats'].append({
                'url': file_url,
                'format': f.get('format'),
                'width': int_or_none(f.get('width')),
                'height': int_or_none(f.get('height')),
                'filesize': int_or_none(f.get('size')),
                'protocol': 'https'})

    # Sort available formats by filesize. The 'filesize' key is always
    # present but may hold None, which cannot be ordered against ints on
    # Python 3, so unknown sizes are mapped to -1 explicitly.
    for entry in entries.values():
        entry['formats'] = sorted(
            entry['formats'],
            key=lambda x: -1 if x.get('filesize') is None else x['filesize'])

    if len(entries) == 1:
        # If there's only one item, use it as the main info dict
        only_video = next(iter(entries.values()))
        if entry_id:
            # An explicitly requested file: its own fields take precedence.
            info = merge_dicts(only_video, info)
        else:
            info = merge_dicts(info, only_video)
    else:
        # Otherwise, we have a playlist.
        info['_type'] = 'playlist'
        info['entries'] = list(entries.values())

    if metadata.get('reviews'):
        info['comments'] = [{
            'id': review.get('review_id'),
            'author': review.get('reviewer'),
            # Title and body may each be missing; treat both as empty text
            # so the concatenation cannot fail on None.
            'text': (str_or_none(review.get('reviewtitle'), '')
                     + '\n\n'
                     + str_or_none(review.get('reviewbody'), '')),
            'timestamp': unified_timestamp(review.get('createdate')),
            'parent': 'root'} for review in metadata['reviews']]

    return info
259 class YoutubeWebArchiveIE(InfoExtractor
):
260 IE_NAME
= 'web.archive:youtube'
261 IE_DESC
= 'web.archive.org saved youtube videos'
262 _VALID_URL
= r
"""(?x)^
263 (?:https?://)?web\.archive\.org/
265 (?:[0-9A-Za-z_*]+/)? # /web and the version index is optional
267 (?:https?(?::|%3[Aa])//)?
269 (?:\w+\.)?youtube\.com/watch(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL
270 |(wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url
272 (?P<id>[0-9A-Za-z_-]{11})(?:%26|\#|&|$)
277 'url': 'https://web.archive.org/web/20150415002341/https://www.youtube.com/watch?v=aYAGB11YrSs',
281 'title': 'Team Fortress 2 - Sandviches!'
286 'url': 'https://web.archive.org/web/2oe/http://wayback-fakeurl.archive.org/yt/97t7Xj_iBv0',
290 'title': 'How Flexible Machines Could Save The World'
294 # Video from 2012, webm format itag 45.
295 'url': 'https://web.archive.org/web/20120712231619/http://www.youtube.com/watch?v=AkhihxRKcrs&gl=US&hl=en',
299 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)'
303 # Old flash-only video. Webpage title starts with "YouTube - ".
304 'url': 'https://web.archive.org/web/20081211103536/http://www.youtube.com/watch?v=jNQXAC9IVRw',
307 'ext': 'unknown_video',
308 'title': 'Me at the zoo'
312 # Flash video with .flv extension (itag 34). Title has prefix "YouTube -"
313 # Title has some weird unicode characters too.
314 'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA',
318 'title': 'Madeon - Pop Culture (live mashup)'
321 { # Some versions of Youtube have have "YouTube" as page title in html (and later rewritten by js).
322 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw',
326 'title': 'kH-G_aIBlFw'
328 'expected_warnings': [
329 'unable to extract title',
333 # First capture is a 302 redirect intermediary page.
334 'url': 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=0altSZ96U4M',
338 'title': '0altSZ96U4M'
340 'expected_warnings': [
341 'unable to extract title',
345 # Video not archived, only capture is unavailable video page
346 'url': 'https://web.archive.org/web/20210530071008/https://www.youtube.com/watch?v=lHJTf93HL1s&spfreload=10',
347 'only_matching': True,
350 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fgl%3DUS%26v%3DAkhihxRKcrs%26hl%3Den',
351 'only_matching': True,
354 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fv%3DAkhihxRKcrs%26gl%3DUS%26hl%3Den',
355 'only_matching': True,
359 def _real_extract(self
, url
):
360 video_id
= self
._match
_id
(url
)
361 title
= video_id
# if we are not able to get a title
363 def _extract_title(webpage
):
364 page_title
= self
._html
_search
_regex
(
365 r
'<title>([^<]*)</title>', webpage
, 'title', fatal
=False) or ''
366 # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
368 page_title
= self
._html
_search
_regex
(
369 r
'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)',
370 page_title
, 'title', default
='')
371 except RegexNotFoundError
:
375 self
.report_warning('unable to extract title', video_id
=video_id
)
379 # If the video is no longer available, the oldest capture may be one before it was removed.
380 # Setting the capture date in url to early date seems to redirect to earliest capture.
381 webpage
= self
._download
_webpage
(
382 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=%s' % video_id
,
383 video_id
=video_id
, fatal
=False, errnote
='unable to download video webpage (probably not archived).')
385 title
= _extract_title(webpage
) or title
387 # Use link translator mentioned in https://github.com/ytdl-org/youtube-dl/issues/13655
388 internal_fake_url
= 'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id
390 video_file_webpage
= self
._request
_webpage
(
391 HEADRequest(internal_fake_url
), video_id
,
392 note
='Fetching video file url', expected_status
=True)
393 except ExtractorError
as e
:
394 # HTTP Error 404 is expected if the video is not saved.
395 if isinstance(e
.cause
, compat_HTTPError
) and e
.cause
.code
== 404:
396 raise ExtractorError(
397 'HTTP Error %s. Most likely the video is not archived or issue with web.archive.org.' % e
.cause
.code
,
400 video_file_url
= compat_urllib_parse_unquote(video_file_webpage
.url
)
401 video_file_url_qs
= parse_qs(video_file_url
)
403 # Attempt to recover any ext & format info from playback url
404 format
= {'url': video_file_url}
405 itag
= try_get(video_file_url_qs
, lambda x
: x
['itag'][0])
406 if itag
and itag
in YoutubeIE
._formats
: # Naughty access but it works
407 format
.update(YoutubeIE
._formats
[itag
])
408 format
.update({'format_id': itag}
)
410 mime
= try_get(video_file_url_qs
, lambda x
: x
['mime'][0])
411 ext
= mimetype2ext(mime
) or determine_ext(video_file_url
)
412 format
.update({'ext': ext}
)
417 'duration': str_to_int(try_get(video_file_url_qs
, lambda x
: x
['dur'][0]))