[yt-dlp.git] / youtube_dl / extractor / archiveorg.py

import json
import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    unified_strdate,
)


class ArchiveOrgIE(InfoExtractor):
    """Extractor for video items published under archive.org/details/<id>.

    The item's JSON description is requested by adding ``output=json`` to
    the details URL; every file whose format name contains ``Video`` is
    exposed as a downloadable format.
    """

    IE_NAME = 'archive.org'
    IE_DESC = 'archive.org videos'
    # The dot in the host name is escaped so that look-alike hosts such as
    # "archiveXorg" are not claimed by this extractor.
    _VALID_URL = r'(?:https?://)?(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
    _TEST = {
        u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
        u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',
        u'md5': u'8af1d4cf447933ed3c7f4871162602db',
        u'info_dict': {
            u"title": u"1968 Demo - FJCC Conference Presentation Reel #1",
            u"description": u"Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug Engelbart's presentation at the Fall Joint Computer Conference in San Francisco, December 9, 1968 titled \"A Research Center for Augmenting Human Intellect.\" For this presentation, Doug and his team astonished the audience by not only relating their research, but demonstrating it live. This was the debut of the mouse, interactive computing, hypermedia, computer supported software engineering, video teleconferencing, etc. See also <a href=\"http://dougengelbart.org/firsts/dougs-1968-demo.html\" rel=\"nofollow\">Doug's 1968 Demo page</a> for more background, highlights, links, and the detailed paper published in this conference proceedings. Filmed on 3 reels: Reel 1 | <a href=\"http://www.archive.org/details/XD300-24_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 2</a> | <a href=\"http://www.archive.org/details/XD300-25_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 3</a>",
            u"upload_date": u"19681210",
            u"uploader": u"SRI International"
        }
    }

    def _real_extract(self, url):
        """Build the info dictionary for the archive.org item at *url*.

        *url* must match ``_VALID_URL``. The returned dictionary always
        carries ``_type``, ``id``, ``title`` and ``formats`` (sorted by
        ascending file size), plus the keys of the largest format copied
        to the top level. ``description``, ``uploader``, ``upload_date``
        and ``thumbnail`` are included only when the JSON response
        provides them.

        Raises KeyError when the response lacks ``metadata``/``title``,
        ``server``, ``dir`` or ``files``, and IndexError when the item
        has no file whose format name contains ``Video``.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Extend an existing query string with '&'; start a new one with
        # '?' otherwise.
        json_url = url + (u'&' if u'?' in url else u'?') + u'output=json'
        json_data = self._download_webpage(json_url, video_id)
        data = json.loads(json_data)

        metadata = data['metadata']
        # Metadata values are lists; the first entry is used.
        title = metadata['title'][0]

        formats = [{
                'format': fdata['format'],
                'url': 'http://' + data['server'] + data['dir'] + fn,
                'file_size': int(fdata['size']),
            }
            for fn, fdata in data['files'].items()
            if 'Video' in fdata['format']]
        # Ascending size, so the last entry is the largest file.
        formats.sort(key=lambda fdata: fdata['file_size'])
        for f in formats:
            f['ext'] = determine_ext(f['url'])

        info = {
            '_type': 'video',
            'id': video_id,
            'title': title,
            'formats': formats,
        }

        # Descriptive fields are treated as optional (like the thumbnail
        # below) so that an item lacking one of them no longer aborts the
        # extraction with a KeyError.
        for info_key, metadata_key in (('description', 'description'),
                                       ('uploader', 'creator')):
            values = metadata.get(metadata_key)
            if values:
                info[info_key] = values[0]
        dates = metadata.get('date')
        if dates:
            info['upload_date'] = unified_strdate(dates[0])

        thumbnail = data.get('misc', {}).get('image')
        if thumbnail:
            info['thumbnail'] = thumbnail

        # TODO: Remove when #980 has been merged
        info.update(formats[-1])

        return info
Commit	Line	Data
5fe3a3c3 PH	1	import json
	2	import re
	3
	4	from .common import InfoExtractor
	5	from ..utils import (
	6	determine_ext,
	7	unified_strdate,
	8	)
	9
	10
	11	class ArchiveOrgIE(InfoExtractor):
	12	IE_NAME = 'archive.org'
	13	IE_DESC = 'archive.org videos'
	14	_VALID_URL = r'(?:https?://)?(?:www\.)?archive.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
	15	_TEST = {
	16	u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
	17	u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',
	18	u'md5': u'8af1d4cf447933ed3c7f4871162602db',
	19	u'info_dict': {
	20	u"title": u"1968 Demo - FJCC Conference Presentation Reel #1",
	21	u"description": u"Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug Engelbart's presentation at the Fall Joint Computer Conference in San Francisco, December 9, 1968 titled \"A Research Center for Augmenting Human Intellect.\" For this presentation, Doug and his team astonished the audience by not only relating their research, but demonstrating it live. This was the debut of the mouse, interactive computing, hypermedia, computer supported software engineering, video teleconferencing, etc. See also <a href=\"http://dougengelbart.org/firsts/dougs-1968-demo.html\" rel=\"nofollow\">Doug's 1968 Demo page</a> for more background, highlights, links, and the detailed paper published in this conference proceedings. Filmed on 3 reels: Reel 1 \| <a href=\"http://www.archive.org/details/XD300-24_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 2</a> \| <a href=\"http://www.archive.org/details/XD300-25_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 3</a>",
	22	u"upload_date": u"19681210",
	23	u"uploader": u"SRI International"
	24	}
	25	}
	26
	27
	28	def _real_extract(self, url):
	29	mobj = re.match(self._VALID_URL, url)
	30	video_id = mobj.group('id')
	31
	32	json_url = url + (u'?' if u'?' in url else '&') + u'output=json'
	33	json_data = self._download_webpage(json_url, video_id)
	34	data = json.loads(json_data)
	35
	36	title = data['metadata']['title'][0]
	37	description = data['metadata']['description'][0]
	38	uploader = data['metadata']['creator'][0]
	39	upload_date = unified_strdate(data['metadata']['date'][0])
	40
	41	formats = [{
	42	'format': fdata['format'],
	43	'url': 'http://' + data['server'] + data['dir'] + fn,
	44	'file_size': int(fdata['size']),
	45	}
	46	for fn,fdata in data['files'].items()
	47	if 'Video' in fdata['format']]
	48	formats.sort(key=lambda fdata: fdata['file_size'])
471a5ee9 JMF	49	for f in formats:
471a5ee9 JMF	50	f['ext'] = determine_ext(f['url'])
5fe3a3c3 PH	51
5fe3a3c3 PH	52	info = {
690e872c	53	'_type': 'video',
5fe3a3c3 PH	54	'id': video_id,
	55	'title': title,
	56	'formats': formats,
	57	'description': description,
	58	'uploader': uploader,
	59	'upload_date': upload_date,
	60	}
	61	thumbnail = data.get('misc', {}).get('image')
	62	if thumbnail:
	63	info['thumbnail'] = thumbnail
	64
	65	# TODO: Remove when #980 has been merged
471a5ee9	66	info.update(formats[-1])
5fe3a3c3	67
471a5ee9	68	return info