]>
Commit | Line | Data |
---|---|---|
3798eadc PH |
1 | from __future__ import unicode_literals |
2 | ||
a4a554a7 | 3 | from .common import InfoExtractor |
d50aca41 RA |
4 | from ..utils import ( |
5 | unified_strdate, | |
6 | clean_html, | |
7 | ) | |
5fe3a3c3 PH |
8 | |
9 | ||
class ArchiveOrgIE(InfoExtractor):
    """Extractor for archive.org ``details`` and ``embed`` pages."""

    IE_NAME = 'archive.org'
    IE_DESC = 'archive.org videos'
    # Matches both /details/<id> and /embed/<id>, with an optional query string.
    _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#]+)(?:[?].*)?$'
    _TESTS = [{
        'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
        'md5': '8af1d4cf447933ed3c7f4871162602db',
        'info_dict': {
            'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
            'ext': 'ogg',
            'title': '1968 Demo - FJCC Conference Presentation Reel #1',
            'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
            'upload_date': '19681210',
            'uploader': 'SRI International'
        }
    }, {
        'url': 'https://archive.org/details/Cops1922',
        'md5': 'bc73c8ab3838b5a8fc6c6651fa7b58ba',
        'info_dict': {
            'id': 'Cops1922',
            'ext': 'mp4',
            'title': 'Buster Keaton\'s "Cops" (1922)',
            'description': 'md5:b4544662605877edd99df22f9620d858',
        }
    }, {
        'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the archive.org item referenced by *url*.

        The embed page is downloaded to obtain the player playlist, then the
        item's JSON metadata (``?output=json``) is downloaded to fill in the
        title, description and, for non-playlist results, uploader and
        upload date.
        """
        video_id = self._match_id(url)

        # The embed page is always fetched, whichever URL form was given.
        webpage = self._download_webpage(
            'http://archive.org/embed/' + video_id, video_id)
        # The playlist is the JSON array passed as the second argument of
        # the page's Play('...', [...], {...}); call.
        jwplayer_playlist = self._parse_json(self._search_regex(
            r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\);",
            webpage, 'jwplayer playlist'), video_id)
        info = self._parse_jwplayer_data(
            {'playlist': jwplayer_playlist}, video_id, base_url=url)

        def get_optional(metadata, field):
            """Return the first value stored under *field*, or None.

            Metadata values are presumably lists of strings (TODO confirm
            against the API).  A missing field, a null value or an empty
            list yields None instead of raising, and a scalar value is
            returned whole rather than being indexed.
            """
            value = metadata.get(field)
            if isinstance(value, list):
                return value[0] if value else None
            return value

        metadata = self._download_json(
            'http://archive.org/details/' + video_id, video_id, query={
                'output': 'json',
            })['metadata']
        info.update({
            # Prefer the metadata title; fall back to the player's title.
            'title': get_optional(metadata, 'title') or info.get('title'),
            'description': clean_html(get_optional(metadata, 'description')),
        })
        # Uploader and date are only attached to single-entry results.
        if info.get('_type') != 'playlist':
            info.update({
                'uploader': get_optional(metadata, 'creator'),
                'upload_date': unified_strdate(get_optional(metadata, 'date')),
            })
        return info