jfr.im git - yt-dlp.git/blame_incremental - yt_dlp/extractor/libraryofcongress.py

... / ...

Commit	Line	Data
	1	import re
	2
	3	from .common import InfoExtractor
	4
	5	from ..utils import (
	6	determine_ext,
	7	float_or_none,
	8	int_or_none,
	9	parse_filesize,
	10	)
	11
	12
	13	class LibraryOfCongressIE(InfoExtractor):
	14	IE_NAME = 'loc'
	15	IE_DESC = 'Library of Congress'
	16	_VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/\|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9a-z_.]+)'
	17	_TESTS = [{
	18	# embedded via <div class="media-player"
	19	'url': 'http://loc.gov/item/90716351/',
	20	'md5': '6ec0ae8f07f86731b1b2ff70f046210a',
	21	'info_dict': {
	22	'id': '90716351',
	23	'ext': 'mp4',
	24	'title': "Pa's trip to Mars",
	25	'duration': 0,
	26	'view_count': int,
	27	},
	28	}, {
	29	# webcast embedded via mediaObjectId
	30	'url': 'https://www.loc.gov/today/cyberlc/feature_wdesc.php?rec=5578',
	31	'info_dict': {
	32	'id': '5578',
	33	'ext': 'mp4',
	34	'title': 'Help! Preservation Training Needs Here, There & Everywhere',
	35	'duration': 3765,
	36	'view_count': int,
	37	'subtitles': 'mincount:1',
	38	},
	39	'params': {
	40	'skip_download': True,
	41	},
	42	}, {
	43	# with direct download links
	44	'url': 'https://www.loc.gov/item/78710669/',
	45	'info_dict': {
	46	'id': '78710669',
	47	'ext': 'mp4',
	48	'title': 'La vie et la passion de Jesus-Christ',
	49	'duration': 0,
	50	'view_count': int,
	51	'formats': 'mincount:4',
	52	},
	53	'params': {
	54	'skip_download': True,
	55	},
	56	}, {
	57	'url': 'https://www.loc.gov/item/ihas.200197114/',
	58	'only_matching': True,
	59	}, {
	60	'url': 'https://www.loc.gov/item/afc1981005_afs20503/',
	61	'only_matching': True,
	62	}]
	63
	64	def _real_extract(self, url):
	65	video_id = self._match_id(url)
	66	webpage = self._download_webpage(url, video_id)
	67
	68	media_id = self._search_regex(
	69	(r'id=(["\'])media-player-(?P<id>.+?)\1',
	70	r'<video[^>]+id=(["\'])uuid-(?P<id>.+?)\1',
	71	r'<video[^>]+data-uuid=(["\'])(?P<id>.+?)\1',
	72	r'mediaObjectId\s:\s(["\'])(?P<id>.+?)\1',
	73	r'data-tab="share-media-(?P<id>[0-9A-F]{32})"'),
	74	webpage, 'media id', group='id')
	75
	76	data = self._download_json(
	77	'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id,
	78	media_id)['mediaObject']
	79
	80	derivative = data['derivatives'][0]
	81	media_url = derivative['derivativeUrl']
	82
	83	title = derivative.get('shortName') or data.get('shortName') or self._og_search_title(
	84	webpage)
	85
	86	# Following algorithm was extracted from setAVSource js function
	87	# found in webpage
	88	media_url = media_url.replace('rtmp', 'https')
	89
	90	is_video = data.get('mediaType', 'v').lower() == 'v'
	91	ext = determine_ext(media_url)
	92	if ext not in ('mp4', 'mp3'):
	93	media_url += '.mp4' if is_video else '.mp3'
	94
	95	formats = []
	96	if '/vod/mp4:' in media_url:
	97	formats.append({
	98	'url': media_url.replace('/vod/mp4:', '/hls-vod/media/') + '.m3u8',
	99	'format_id': 'hls',
	100	'ext': 'mp4',
	101	'protocol': 'm3u8_native',
	102	'quality': 1,
	103	})
	104	http_format = {
	105	'url': re.sub(r'(://[^/]+/)(?:[^/]+/)*(?:mp4\|mp3):', r'\1', media_url),
	106	'format_id': 'http',
	107	'quality': 1,
	108	}
	109	if not is_video:
	110	http_format['vcodec'] = 'none'
	111	formats.append(http_format)
	112
	113	download_urls = set()
	114	for m in re.finditer(
	115	r'<option[^>]+value=(["\'])(?P<url>.+?)\1[^>]+data-file-download=[^>]+>\s(?P<id>.+?)(?:(?: \|\s+)\((?P<size>.+?)\))?\s<', webpage):
	116	format_id = m.group('id').lower()
	117	if format_id in ('gif', 'jpeg'):
	118	continue
	119	download_url = m.group('url')
	120	if download_url in download_urls:
	121	continue
	122	download_urls.add(download_url)
	123	formats.append({
	124	'url': download_url,
	125	'format_id': format_id,
	126	'filesize_approx': parse_filesize(m.group('size')),
	127	})
	128
	129	duration = float_or_none(data.get('duration'))
	130	view_count = int_or_none(data.get('viewCount'))
	131
	132	subtitles = {}
	133	cc_url = data.get('ccUrl')
	134	if cc_url:
	135	subtitles.setdefault('en', []).append({
	136	'url': cc_url,
	137	'ext': 'ttml',
	138	})
	139
	140	return {
	141	'id': video_id,
	142	'title': title,
	143	'thumbnail': self._og_search_thumbnail(webpage, default=None),
	144	'duration': duration,
	145	'view_count': view_count,
	146	'formats': formats,
	147	'subtitles': subtitles,
	148	}