]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/archiveorg.py
[downloaders] Fix API access
[yt-dlp.git] / yt_dlp / extractor / archiveorg.py
CommitLineData
3798eadc
PH
1from __future__ import unicode_literals
2
a3e26449 3import re
4import json
5
a4a554a7 6from .common import InfoExtractor
a3e26449 7from ..compat import compat_urllib_parse_unquote_plus
d50aca41 8from ..utils import (
a3e26449 9 KNOWN_EXTENSIONS,
10
11 extract_attributes,
d50aca41 12 unified_strdate,
a3e26449 13 unified_timestamp,
d50aca41 14 clean_html,
a3e26449 15 dict_get,
16 parse_duration,
17 int_or_none,
18 str_or_none,
19 merge_dicts,
d50aca41 20)
5fe3a3c3
PH
21
22
class ArchiveOrgIE(InfoExtractor):
    """Extractor for archive.org ``/details/`` and ``/embed/`` pages.

    The URL id is ``<identifier>`` optionally followed by ``/<file name>``.
    Playlist entries and subtitle tracks are read from the embeddable player
    page; item and per-file metadata come from the ``/metadata/`` JSON
    endpoint.
    """
    IE_NAME = 'archive.org'
    IE_DESC = 'archive.org video and audio'
    _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^?#]+)(?:[?].*)?$'
    _TESTS = [{
        'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
        'md5': '8af1d4cf447933ed3c7f4871162602db',
        'info_dict': {
            'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
            'ext': 'ogv',
            'title': '1968 Demo - FJCC Conference Presentation Reel #1',
            'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
            'release_date': '19681210',
            'timestamp': 1268695290,
            'upload_date': '20100315',
            'creator': 'SRI International',
            'uploader': 'laura@archive.org',
        },
    }, {
        'url': 'https://archive.org/details/Cops1922',
        'md5': '0869000b4ce265e8ca62738b336b268a',
        'info_dict': {
            'id': 'Cops1922',
            'ext': 'mp4',
            'title': 'Buster Keaton\'s "Cops" (1922)',
            'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c',
            'uploader': 'yorkmba99@hotmail.com',
            'timestamp': 1387699629,
            'upload_date': "20131222",
        },
    }, {
        'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
        'only_matching': True,
    }, {
        'url': 'https://archive.org/details/Election_Ads',
        'md5': '284180e857160cf866358700bab668a3',
        'info_dict': {
            'id': 'Election_Ads/Commercial-JFK1960ElectionAdCampaignJingle.mpg',
            'title': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://archive.org/details/Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
        'md5': '7915213ef02559b5501fe630e1a53f59',
        'info_dict': {
            'id': 'Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
            'title': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg',
            'ext': 'mp4',
            'timestamp': 1205588045,
            'uploader': 'mikedavisstripmaster@yahoo.com',
            'description': '1960 Presidential Campaign Election Commercials John F Kennedy, Richard M Nixon',
            'upload_date': '20080315',
        },
    }, {
        'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16',
        'md5': '7d07ffb42aba6537c28e053efa4b54c9',
        'info_dict': {
            'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t01.flac',
            'title': 'Turning',
            'ext': 'flac',
        },
    }, {
        'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
        'md5': 'a07cd8c6ab4ee1560f8a0021717130f3',
        'info_dict': {
            'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
            'title': 'Deal',
            'ext': 'flac',
            'timestamp': 1205895624,
            'uploader': 'mvernon54@yahoo.com',
            'description': 'md5:6a31f1996db0aa0fc9da6d6e708a1bb0',
            'upload_date': '20080319',
            'location': 'Barton Hall - Cornell University',
        },
    }, {
        'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik',
        'md5': '7cb019baa9b332e82ea7c10403acd180',
        'info_dict': {
            'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/01.01. Bells Of Rostov.mp3',
            'title': 'Bells Of Rostov',
            'ext': 'mp3',
        },
    }, {
        'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02.+Song+And+Chorus+In+The+Polovetsian+Camp+From+%22Prince+Igor%22+(Act+2%2C+Scene+1).mp3',
        'md5': '1d0aabe03edca83ca58d9ed3b493a3c3',
        'info_dict': {
            'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02. Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1).mp3',
            'title': 'Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1)',
            'ext': 'mp3',
            'timestamp': 1569662587,
            'uploader': 'associate-joygen-odiongan@archive.org',
            'description': 'md5:012b2d668ae753be36896f343d12a236',
            'upload_date': '20190928',
        },
    }]

    @staticmethod
    def _playlist_data(webpage):
        """Return the decoded playlist embedded in an archive.org player page.

        Finds the first self-closing ``<input>`` element whose class is
        ``js-play8-playlist`` and JSON-decodes its ``value`` attribute.

        Raises IndexError if no such element is present in ``webpage``.
        """
        element = re.findall(r'''(?xs)
            <input
            (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
            \s+class=['"]?js-play8-playlist['"]?
            (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
            \s*/>
        ''', webpage)[0]

        return json.loads(extract_attributes(element)['value'])

    def _real_extract(self, url):
        """Build the info dict (single item or playlist) for an archive.org URL.

        Returns the merged item/file info dict when exactly one playlist entry
        is selected, otherwise a ``_type: 'playlist'`` dict whose ``entries``
        are the per-file info dicts. Reviews, when present in the metadata,
        are attached as ``comments``.
        """
        video_id = compat_urllib_parse_unquote_plus(self._match_id(url))
        # "<identifier>/<file>" -> (identifier, file); bare identifier -> (identifier, None)
        identifier, entry_id = (video_id.split('/', 1) + [None])[:2]

        # Archive.org metadata API doesn't clearly demarcate playlist entries
        # or subtitle tracks, so we get them from the embeddable player.
        embed_page = self._download_webpage(
            'https://archive.org/embed/' + identifier, identifier)
        playlist = self._playlist_data(embed_page)

        entries = {}
        for p in playlist:
            # If the user specified a playlist entry in the URL, ignore the
            # rest of the playlist.
            if entry_id and p['orig'] != entry_id:
                continue

            entry = entries[p['orig']] = {
                'formats': [],
                'thumbnails': [],
                'artist': p.get('artist'),
                'track': p.get('title'),
                'subtitles': {}}

            for track in p.get('tracks', []):
                if track['kind'] != 'subtitles':
                    continue

                # Store under 'subtitles' as {label: [subformat, ...]} rather
                # than as a stray top-level key of the entry.
                entry['subtitles'].setdefault(track['label'], []).append({
                    'url': 'https://archive.org/' + track['file'].lstrip('/')})

        metadata = self._download_json(
            'http://archive.org/metadata/' + identifier, identifier)
        m = metadata['metadata']
        # Use the identifier as reported by the metadata response from here on.
        identifier = m['identifier']

        info = {
            'id': identifier,
            'title': m['title'],
            'description': clean_html(m.get('description')),
            'uploader': dict_get(m, ['uploader', 'adder']),
            'creator': m.get('creator'),
            'license': m.get('licenseurl'),
            'release_date': unified_strdate(m.get('date')),
            'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
            'webpage_url': 'https://archive.org/details/' + identifier,
            'location': m.get('venue'),
            'release_year': int_or_none(m.get('year'))}

        for f in metadata['files']:
            name = f['name']
            if name in entries:
                # The file is itself a playlist entry: attach its own metadata.
                entries[name] = merge_dicts(entries[name], {
                    'id': identifier + '/' + name,
                    'title': f.get('title') or name,
                    'display_id': name,
                    'description': clean_html(f.get('description')),
                    'creator': f.get('creator'),
                    'duration': parse_duration(f.get('length')),
                    'track_number': int_or_none(f.get('track')),
                    'album': f.get('album'),
                    'discnumber': int_or_none(f.get('disc')),
                    'release_year': int_or_none(f.get('year'))})
                entry = entries[name]
            elif f.get('original') in entries:
                # The file names a playlist entry as its 'original'.
                entry = entries[f['original']]
            else:
                continue

            file_url = 'https://archive.org/download/' + identifier + '/' + name

            if f.get('format') == 'Thumbnail':
                entry['thumbnails'].append({
                    'id': name,
                    'url': file_url,
                    'width': int_or_none(f.get('width')),
                    'height': int_or_none(f.get('height')),
                    'filesize': int_or_none(f.get('size'))})

            extension = (name.rsplit('.', 1) + [None])[1]
            if extension in KNOWN_EXTENSIONS:
                entry['formats'].append({
                    'url': file_url,
                    'format': f.get('format'),
                    'width': int_or_none(f.get('width')),
                    'height': int_or_none(f.get('height')),
                    'filesize': int_or_none(f.get('size')),
                    'protocol': 'https'})

        # Sort available formats by filesize. 'filesize' is always present but
        # may be None, so fall back with `or` (a .get() default would never be
        # used and None cannot be ordered against int on Python 3).
        for entry in entries.values():
            entry['formats'] = sorted(
                entry['formats'], key=lambda x: x.get('filesize') or -1)

        if len(entries) == 1:
            # If there's only one item, use it as the main info dict
            only_video = next(iter(entries.values()))
            if entry_id:
                # An explicitly requested file: its own fields take precedence.
                info = merge_dicts(only_video, info)
            else:
                info = merge_dicts(info, only_video)
        else:
            # Otherwise, we have a playlist.
            info['_type'] = 'playlist'
            info['entries'] = list(entries.values())

        if metadata.get('reviews'):
            info['comments'] = [{
                'id': review.get('review_id'),
                'author': review.get('reviewer'),
                # Both parts may be missing; never concatenate None.
                'text': (str_or_none(review.get('reviewtitle'), '')
                         + '\n\n'
                         + str_or_none(review.get('reviewbody'), '')),
                'timestamp': unified_timestamp(review.get('createdate')),
                'parent': 'root',
            } for review in metadata['reviews']]

        return info