[yt-dlp.git] / yt_dlp / extractor / archiveorg.py

from __future__ import unicode_literals

import re
import json

from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote_plus
from ..utils import (
    KNOWN_EXTENSIONS,

    extract_attributes,
    unified_strdate,
    unified_timestamp,
    clean_html,
    dict_get,
    parse_duration,
    int_or_none,
    str_or_none,
    merge_dicts,
)


class ArchiveOrgIE(InfoExtractor):
    """Extractor for video and audio items hosted on archive.org.

    Accepts ``/details/<identifier>`` and ``/embed/<identifier>`` URLs.  An
    optional ``/<file name>`` suffix after the identifier selects a single
    entry of a multi-file item.
    """

    IE_NAME = 'archive.org'
    IE_DESC = 'archive.org video and audio'
    _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^?#]+)(?:[?].*)?$'
    _TESTS = [{
        'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
        'md5': '8af1d4cf447933ed3c7f4871162602db',
        'info_dict': {
            'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
            'ext': 'ogv',
            'title': '1968 Demo - FJCC Conference Presentation Reel #1',
            'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
            'release_date': '19681210',
            'timestamp': 1268695290,
            'upload_date': '20100315',
            'creator': 'SRI International',
            'uploader': 'laura@archive.org',
        },
    }, {
        'url': 'https://archive.org/details/Cops1922',
        'md5': '0869000b4ce265e8ca62738b336b268a',
        'info_dict': {
            'id': 'Cops1922',
            'ext': 'mp4',
            'title': 'Buster Keaton\'s "Cops" (1922)',
            'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c',
            'uploader': 'yorkmba99@hotmail.com',
            'timestamp': 1387699629,
            'upload_date': "20131222",
        },
    }, {
        'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
        'only_matching': True,
    }, {
        'url': 'https://archive.org/details/Election_Ads',
        'md5': '284180e857160cf866358700bab668a3',
        'info_dict': {
            'id': 'Election_Ads/Commercial-JFK1960ElectionAdCampaignJingle.mpg',
            'title': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://archive.org/details/Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
        'md5': '7915213ef02559b5501fe630e1a53f59',
        'info_dict': {
            'id': 'Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
            'title': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg',
            'ext': 'mp4',
            'timestamp': 1205588045,
            'uploader': 'mikedavisstripmaster@yahoo.com',
            'description': '1960 Presidential Campaign Election Commercials John F Kennedy, Richard M Nixon',
            'upload_date': '20080315',
        },
    }, {
        'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16',
        'md5': '7d07ffb42aba6537c28e053efa4b54c9',
        'info_dict': {
            'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t01.flac',
            'title': 'Turning',
            'ext': 'flac',
        },
    }, {
        'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
        'md5': 'a07cd8c6ab4ee1560f8a0021717130f3',
        'info_dict': {
            'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
            'title': 'Deal',
            'ext': 'flac',
            'timestamp': 1205895624,
            'uploader': 'mvernon54@yahoo.com',
            'description': 'md5:6a31f1996db0aa0fc9da6d6e708a1bb0',
            'upload_date': '20080319',
            'location': 'Barton Hall - Cornell University',
        },
    }, {
        'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik',
        'md5': '7cb019baa9b332e82ea7c10403acd180',
        'info_dict': {
            'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/01.01. Bells Of Rostov.mp3',
            'title': 'Bells Of Rostov',
            'ext': 'mp3',
        },
    }, {
        'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02.+Song+And+Chorus+In+The+Polovetsian+Camp+From+%22Prince+Igor%22+(Act+2%2C+Scene+1).mp3',
        'md5': '1d0aabe03edca83ca58d9ed3b493a3c3',
        'info_dict': {
            'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02. Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1).mp3',
            'title': 'Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1)',
            'ext': 'mp3',
            'timestamp': 1569662587,
            'uploader': 'associate-joygen-odiongan@archive.org',
            'description': 'md5:012b2d668ae753be36896f343d12a236',
            'upload_date': '20190928',
        },
    }]

    @staticmethod
    def _playlist_data(webpage):
        """Return the decoded player playlist embedded in an embed page.

        Finds the first ``<input ... class="js-play8-playlist" ... />``
        element in ``webpage`` and JSON-decodes its ``value`` attribute.

        Raises IndexError when no such element is present, KeyError when the
        element has no ``value`` attribute, and ValueError when the value is
        not valid JSON.
        """
        # The pattern has no capturing groups, so findall() yields the whole
        # matched element text.
        element = re.findall(r'''(?xs)
            <input
            (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
            \s+class=['"]?js-play8-playlist['"]?
            (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
            \s*/>
        ''', webpage)[0]

        return json.loads(extract_attributes(element)['value'])

    def _real_extract(self, url):
        """Build the info dict (single item or playlist) for ``url``.

        The embed page supplies the list of playable entries and their
        subtitle tracks; the metadata API supplies item-level metadata, the
        downloadable files (formats and thumbnails) and the reviews, which
        are exposed as comments.
        """
        video_id = compat_urllib_parse_unquote_plus(self._match_id(url))
        # "<identifier>/<entry>" -> (identifier, entry); entry is None when
        # the URL names the item only.
        identifier, entry_id = (video_id.split('/', 1) + [None])[:2]

        # Archive.org metadata API doesn't clearly demarcate playlist entries
        # or subtitle tracks, so we get them from the embeddable player.
        embed_page = self._download_webpage(
            'https://archive.org/embed/' + identifier, identifier)
        playlist = self._playlist_data(embed_page)

        entries = {}
        for p in playlist:
            # If the user specified a playlist entry in the URL, ignore the
            # rest of the playlist.
            if entry_id and p['orig'] != entry_id:
                continue

            entry = {
                'formats': [],
                'thumbnails': [],
                'artist': p.get('artist'),
                'track': p.get('title'),
                'subtitles': {},
            }
            entries[p['orig']] = entry

            # "tracks" may be absent or null for entries without captions.
            for track in p.get('tracks') or []:
                if track.get('kind') != 'subtitles':
                    continue

                # Subtitles belong under the 'subtitles' key, as a list of
                # format dicts per language; previously they were written
                # straight into the entry dict under the track label.
                entry['subtitles'].setdefault(track['label'], []).append({
                    'url': 'https://archive.org/' + track['file'].lstrip('/'),
                })

        # Use https here like every other archive.org request in this method.
        metadata = self._download_json(
            'https://archive.org/metadata/' + identifier, identifier)
        m = metadata['metadata']
        # Continue with the identifier exactly as the API reports it.
        identifier = m['identifier']

        info = {
            'id': identifier,
            'title': m['title'],
            'description': clean_html(m.get('description')),
            'uploader': dict_get(m, ['uploader', 'adder']),
            'creator': m.get('creator'),
            'license': m.get('licenseurl'),
            'release_date': unified_strdate(m.get('date')),
            'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
            'webpage_url': 'https://archive.org/details/' + identifier,
            'location': m.get('venue'),
            'release_year': int_or_none(m.get('year')),
        }

        for f in metadata['files']:
            name = f['name']
            if name in entries:
                # This file is a playlist entry itself: attach its metadata.
                entries[name] = merge_dicts(entries[name], {
                    'id': identifier + '/' + name,
                    'title': f.get('title') or name,
                    'display_id': name,
                    'description': clean_html(f.get('description')),
                    'creator': f.get('creator'),
                    'duration': parse_duration(f.get('length')),
                    'track_number': int_or_none(f.get('track')),
                    'album': f.get('album'),
                    # 'disc_number' is the documented info-dict field name;
                    # the former 'discnumber' key was ignored downstream.
                    'disc_number': int_or_none(f.get('disc')),
                    'release_year': int_or_none(f.get('year')),
                })
                entry = entries[name]
            elif f.get('original') in entries:
                # A file derived from a playlist entry (transcode, thumbnail).
                entry = entries[f['original']]
            else:
                continue

            download_url = 'https://archive.org/download/' + identifier + '/' + name

            if f.get('format') == 'Thumbnail':
                entry['thumbnails'].append({
                    'id': name,
                    'url': download_url,
                    'width': int_or_none(f.get('width')),
                    # Previously read from 'width' by mistake.
                    'height': int_or_none(f.get('height')),
                    'filesize': int_or_none(f.get('size')),
                })

            # Only files with a known media extension become formats.
            extension = (name.rsplit('.', 1) + [None])[1]
            if extension in KNOWN_EXTENSIONS:
                entry['formats'].append({
                    'url': download_url,
                    'format': f.get('format'),
                    'width': int_or_none(f.get('width')),
                    'height': int_or_none(f.get('height')),
                    'filesize': int_or_none(f.get('size')),
                    'protocol': 'https',
                })

        # Sort available formats by filesize. The 'filesize' key is always
        # present but may be None, which cannot be compared with ints on
        # Python 3, so unknown sizes sort first as -1.
        for entry in entries.values():
            entry['formats'] = sorted(
                entry['formats'], key=lambda x: x.get('filesize') or -1)

        if len(entries) == 1:
            # If there's only one item, use it as the main info dict
            only_video = entries[list(entries.keys())[0]]
            if entry_id:
                # An explicitly requested entry keeps its own id/title.
                info = merge_dicts(only_video, info)
            else:
                info = merge_dicts(info, only_video)
        else:
            # Otherwise, we have a playlist.
            info['_type'] = 'playlist'
            info['entries'] = list(entries.values())

        if metadata.get('reviews'):
            info['comments'] = []
            for review in metadata['reviews']:
                info['comments'].append({
                    'id': review.get('review_id'),
                    'author': review.get('reviewer'),
                    # Either part may be missing; fall back to '' so the
                    # concatenation cannot fail on None.
                    'text': (str_or_none(review.get('reviewtitle'), '')
                             + '\n\n'
                             + str_or_none(review.get('reviewbody'), '')),
                    'timestamp': unified_timestamp(review.get('createdate')),
                    'parent': 'root',
                })

        return info
Commit	Line	Data
3798eadc PH	1	from __future__ import unicode_literals
3798eadc PH	2
a3e26449	3	import re
	4	import json
	5
a4a554a7	6	from .common import InfoExtractor
a3e26449	7	from ..compat import compat_urllib_parse_unquote_plus
d50aca41	8	from ..utils import (
a3e26449	9	KNOWN_EXTENSIONS,
	10
	11	extract_attributes,
d50aca41	12	unified_strdate,
a3e26449	13	unified_timestamp,
d50aca41	14	clean_html,
a3e26449	15	dict_get,
	16	parse_duration,
	17	int_or_none,
	18	str_or_none,
	19	merge_dicts,
d50aca41	20	)
5fe3a3c3 PH	21
5fe3a3c3 PH	22
a4a554a7	23	class ArchiveOrgIE(InfoExtractor):
5fe3a3c3	24	IE_NAME = 'archive.org'
a3e26449	25	IE_DESC = 'archive.org video and audio'
a3e26449	26	_VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details\|embed)/(?P<id>[^?#]+)(?:[?].*)?$'
e8e28989 S	27	_TESTS = [{
e8e28989 S	28	'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
3798eadc PH	29	'md5': '8af1d4cf447933ed3c7f4871162602db',
3798eadc PH	30	'info_dict': {
e8e28989	31	'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
a3e26449	32	'ext': 'ogv',
e8e28989	33	'title': '1968 Demo - FJCC Conference Presentation Reel #1',
d50aca41	34	'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
a3e26449	35	'release_date': '19681210',
	36	'timestamp': 1268695290,
	37	'upload_date': '20100315',
	38	'creator': 'SRI International',
	39	'uploader': 'laura@archive.org',
	40	},
e8e28989 S	41	}, {
e8e28989 S	42	'url': 'https://archive.org/details/Cops1922',
c12b4b80	43	'md5': '0869000b4ce265e8ca62738b336b268a',
e8e28989 S	44	'info_dict': {
e8e28989 S	45	'id': 'Cops1922',
d50aca41	46	'ext': 'mp4',
e8e28989	47	'title': 'Buster Keaton\'s "Cops" (1922)',
a3e26449	48	'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c',
	49	'uploader': 'yorkmba99@hotmail.com',
	50	'timestamp': 1387699629,
	51	'upload_date': "20131222",
	52	},
d50aca41 RA	53	}, {
	54	'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
	55	'only_matching': True,
a3e26449	56	}, {
	57	'url': 'https://archive.org/details/Election_Ads',
	58	'md5': '284180e857160cf866358700bab668a3',
	59	'info_dict': {
	60	'id': 'Election_Ads/Commercial-JFK1960ElectionAdCampaignJingle.mpg',
	61	'title': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
	62	'ext': 'mp4',
	63	},
	64	}, {
	65	'url': 'https://archive.org/details/Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
	66	'md5': '7915213ef02559b5501fe630e1a53f59',
	67	'info_dict': {
	68	'id': 'Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
	69	'title': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg',
	70	'ext': 'mp4',
	71	'timestamp': 1205588045,
	72	'uploader': 'mikedavisstripmaster@yahoo.com',
	73	'description': '1960 Presidential Campaign Election Commercials John F Kennedy, Richard M Nixon',
	74	'upload_date': '20080315',
	75	},
	76	}, {
	77	'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16',
	78	'md5': '7d07ffb42aba6537c28e053efa4b54c9',
	79	'info_dict': {
	80	'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t01.flac',
	81	'title': 'Turning',
	82	'ext': 'flac',
	83	},
	84	}, {
	85	'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
	86	'md5': 'a07cd8c6ab4ee1560f8a0021717130f3',
	87	'info_dict': {
	88	'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
	89	'title': 'Deal',
	90	'ext': 'flac',
	91	'timestamp': 1205895624,
	92	'uploader': 'mvernon54@yahoo.com',
	93	'description': 'md5:6a31f1996db0aa0fc9da6d6e708a1bb0',
	94	'upload_date': '20080319',
	95	'location': 'Barton Hall - Cornell University',
	96	},
	97	}, {
	98	'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik',
	99	'md5': '7cb019baa9b332e82ea7c10403acd180',
	100	'info_dict': {
	101	'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/01.01. Bells Of Rostov.mp3',
	102	'title': 'Bells Of Rostov',
	103	'ext': 'mp3',
	104	},
	105	}, {
	106	'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02.+Song+And+Chorus+In+The+Polovetsian+Camp+From+%22Prince+Igor%22+(Act+2%2C+Scene+1).mp3',
	107	'md5': '1d0aabe03edca83ca58d9ed3b493a3c3',
	108	'info_dict': {
	109	'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02. Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1).mp3',
	110	'title': 'Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1)',
	111	'ext': 'mp3',
	112	'timestamp': 1569662587,
	113	'uploader': 'associate-joygen-odiongan@archive.org',
	114	'description': 'md5:012b2d668ae753be36896f343d12a236',
	115	'upload_date': '20190928',
	116	},
e8e28989	117	}]
ff7a07d5	118
a3e26449	119	@staticmethod
	120	def _playlist_data(webpage):
	121	element = re.findall(r'''(?xs)
	122	<input
	123	(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]\|="[^"]"\|='[^']'\|))?
	124	\s+class=['"]?js-play8-playlist['"]?
	125	(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]\|="[^"]"\|='[^']'\|))?
	126	\s*/>
	127	''', webpage)[0]
	128
	129	return json.loads(extract_attributes(element)['value'])
	130
5fe3a3c3	131	def _real_extract(self, url):
a3e26449	132	video_id = compat_urllib_parse_unquote_plus(self._match_id(url))
	133	identifier, entry_id = (video_id.split('/', 1) + [None])[:2]
	134
	135	# Archive.org metadata API doesn't clearly demarcate playlist entries
	136	# or subtitle tracks, so we get them from the embeddable player.
	137	embed_page = self._download_webpage(
	138	'https://archive.org/embed/' + identifier, identifier)
	139	playlist = self._playlist_data(embed_page)
	140
	141	entries = {}
	142	for p in playlist:
	143	# If the user specified a playlist entry in the URL, ignore the
	144	# rest of the playlist.
	145	if entry_id and p['orig'] != entry_id:
	146	continue
	147
	148	entries[p['orig']] = {
	149	'formats': [],
	150	'thumbnails': [],
	151	'artist': p.get('artist'),
	152	'track': p.get('title'),
	153	'subtitles': {}}
	154
	155	for track in p.get('tracks', []):
	156	if track['kind'] != 'subtitles':
	157	continue
	158
	159	entries[p['orig']][track['label']] = {
	160	'url': 'https://archive.org/' + track['file'].lstrip('/')}
5fe3a3c3	161
d50aca41	162	metadata = self._download_json(
a3e26449	163	'http://archive.org/metadata/' + identifier, identifier)
	164	m = metadata['metadata']
	165	identifier = m['identifier']
	166
	167	info = {
	168	'id': identifier,
	169	'title': m['title'],
	170	'description': clean_html(m.get('description')),
	171	'uploader': dict_get(m, ['uploader', 'adder']),
	172	'creator': m.get('creator'),
	173	'license': m.get('licenseurl'),
	174	'release_date': unified_strdate(m.get('date')),
	175	'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
	176	'webpage_url': 'https://archive.org/details/' + identifier,
	177	'location': m.get('venue'),
	178	'release_year': int_or_none(m.get('year'))}
	179
	180	for f in metadata['files']:
	181	if f['name'] in entries:
	182	entries[f['name']] = merge_dicts(entries[f['name']], {
	183	'id': identifier + '/' + f['name'],
	184	'title': f.get('title') or f['name'],
	185	'display_id': f['name'],
	186	'description': clean_html(f.get('description')),
	187	'creator': f.get('creator'),
	188	'duration': parse_duration(f.get('length')),
	189	'track_number': int_or_none(f.get('track')),
	190	'album': f.get('album'),
	191	'discnumber': int_or_none(f.get('disc')),
	192	'release_year': int_or_none(f.get('year'))})
	193	entry = entries[f['name']]
	194	elif f.get('original') in entries:
	195	entry = entries[f['original']]
	196	else:
	197	continue
	198
	199	if f.get('format') == 'Thumbnail':
	200	entry['thumbnails'].append({
	201	'id': f['name'],
	202	'url': 'https://archive.org/download/' + identifier + '/' + f['name'],
	203	'width': int_or_none(f.get('width')),
	204	'height': int_or_none(f.get('width')),
	205	'filesize': int_or_none(f.get('size'))})
	206
	207	extension = (f['name'].rsplit('.', 1) + [None])[1]
	208	if extension in KNOWN_EXTENSIONS:
	209	entry['formats'].append({
	210	'url': 'https://archive.org/download/' + identifier + '/' + f['name'],
	211	'format': f.get('format'),
	212	'width': int_or_none(f.get('width')),
	213	'height': int_or_none(f.get('height')),
	214	'filesize': int_or_none(f.get('size')),
	215	'protocol': 'https'})
	216
	217	# Sort available formats by filesize
	218	for entry in entries.values():
	219	entry['formats'] = list(sorted(entry['formats'], key=lambda x: x.get('filesize', -1)))
	220
	221	if len(entries) == 1:
	222	# If there's only one item, use it as the main info dict
	223	only_video = entries[list(entries.keys())[0]]
	224	if entry_id:
	225	info = merge_dicts(only_video, info)
	226	else:
227	info = merge_dicts(info, only_video)
228	else:
229	# Otherwise, we have a playlist.
230	info['_type'] = 'playlist'
231	info['entries'] = list(entries.values())
232
233	if metadata.get('reviews'):
234	info['comments'] = []
235	for review in metadata['reviews']:
236	info['comments'].append({
237	'id': review.get('review_id'),
238	'author': review.get('reviewer'),
239	'text': str_or_none(review.get('reviewtitle'), '') + '\n\n' + review.get('reviewbody'),
240	'timestamp': unified_timestamp(review.get('createdate')),
241	'parent': 'root'})
242
84bc23b4	243	return info