jfr.im git - yt-dlp.git/blame_incremental - yt

... / ...

Commit	Line	Data
	1	import itertools
	2
	3	from .common import InfoExtractor
	4	from ..utils import (
	5	clean_html,
	6	extract_attributes,
	7	get_element_by_class,
	8	get_element_html_by_class,
	9	get_elements_html_by_class,
	10	parse_qs,
	11	traverse_obj,
	12	unified_strdate,
	13	urljoin
	14	)
	15
	16
	17	class TheGuardianPodcastIE(InfoExtractor):
	18	_VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/audio/\d{4}/\w{3}/\d{1,2}/(?P<id>[\w-]+)'
	19	_TESTS = [{
	20	'url': 'https://www.theguardian.com/news/audio/2023/nov/03/we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast',
	21	'md5': 'd1771744681789b4cd7da2a08e487702',
	22	'info_dict': {
	23	'id': 'we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast',
	24	'ext': 'mp3',
	25	'title': '‘We are just getting started’: the plastic-eating bacteria that could change the world – podcast',
	26	'description': 'md5:cfd3df2791d394d2ab62cd571d5207ee',
	27	'creator': 'Stephen Buranyi',
	28	'thumbnail': 'md5:73c12558fcb3b0e2a59422bfb33b3f79',
	29	'release_date': '20231103'
	30	}
	31	}, {
	32	'url': 'https://www.theguardian.com/news/audio/2023/oct/30/the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast',
	33	'md5': 'd1771744681789b4cd7da2a08e487702',
	34	'info_dict': {
	35	'id': 'the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast',
	36	'ext': 'mp3',
	37	'title': 'The trials of Robert Habeck: is the world’s most powerful green politician doomed to fail? – podcast',
	38	'description': 'md5:1b5cf6582d1771c6b7077784b5456994',
	39	'creator': 'Philip Oltermann',
	40	'thumbnail': 'md5:6e5c5ec43843e956e20be793722e9080',
	41	'release_date': '20231030'
	42	}
	43	}, {
	44	'url': 'https://www.theguardian.com/football/audio/2023/nov/06/arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly',
	45	'md5': 'a2fcff6f8e060a95b1483295273dc35e',
	46	'info_dict': {
	47	'id': 'arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly',
	48	'ext': 'mp3',
	49	'title': 'Arsenal feel hard done by and Luton hold Liverpool – Football Weekly',
	50	'description': 'md5:286a9fbddaeb7c83cc65d1c4a5330b2a',
	51	'creator': 'Max Rushden',
	52	'thumbnail': 'md5:93eb7d6440f1bb94eb3a6cad63f48afd',
	53	'release_date': '20231106'
	54	}
	55	}, {
	56	'url': 'https://www.theguardian.com/politics/audio/2023/nov/02/the-covid-inquiry-politics-weekly-uk-podcast',
	57	'md5': '06a0f7e9701a80c8064a5d35690481ec',
	58	'info_dict': {
	59	'id': 'the-covid-inquiry-politics-weekly-uk-podcast',
	60	'ext': 'mp3',
	61	'title': 'The Covid inquiry \| Politics Weekly UK - podcast',
	62	'description': 'md5:207c98859c14903582b17d25b014046e',
	63	'creator': 'Gaby Hinsliff',
	64	'thumbnail': 'md5:28932a7b5a25b057be330d2ed70ea7f3',
	65	'release_date': '20231102'
	66	}
	67	}]
	68
	69	def _real_extract(self, url):
	70	video_id = self._match_id(url)
	71	webpage = self._download_webpage(url, video_id)
	72	return {
	73	'id': video_id,
	74	'title': self._og_search_title(webpage) or get_element_by_class('content__headline', webpage),
	75	'description': self._og_search_description(webpage),
	76	'creator': self._html_search_meta('author', webpage),
	77	'thumbnail': self._og_search_thumbnail(webpage),
	78	'release_date': unified_strdate(self._html_search_meta('article:published_time', webpage)),
	79	'url': extract_attributes(get_element_html_by_class(
	80	'podcast__player', webpage) or '').get('data-source'),
	81	}
	82
	83
	84	class TheGuardianPodcastPlaylistIE(InfoExtractor):
	85	_VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/series/(?P<id>[\w-]+)(?:\?page=\d+)?'
	86	_TESTS = [{
	87	'url': 'https://www.theguardian.com/football/series/theguardianswomensfootballweekly',
	88	'info_dict': {
	89	'id': 'theguardianswomensfootballweekly',
	90	'title': "The Guardian's Women's Football Weekly",
	91	'description': 'md5:e2cc021311e582d29935a73614a43f51'
	92	},
	93	'playlist_mincount': 69
	94	}, {
	95	'url': 'https://www.theguardian.com/news/series/todayinfocus?page=2',
	96	'info_dict': {
	97	'id': 'todayinfocus',
	98	'title': 'Today in Focus',
	99	'description': 'md5:0f097764fc0d359e0b6eb537be0387e2'
	100	},
	101	'playlist_mincount': 1261
	102	}, {
	103	'url': 'https://www.theguardian.com/news/series/the-audio-long-read',
	104	'info_dict': {
	105	'id': 'the-audio-long-read',
	106	'title': 'The Audio Long Read',
	107	'description': 'md5:5462994a27527309562b25b6defc4ef3'
	108	},
	109	'playlist_mincount': 996
	110	}]
	111
	112	def _entries(self, url, playlist_id):
	113	for page in itertools.count(1):
	114	webpage, urlh = self._download_webpage_handle(
	115	url, playlist_id, f'Downloading page {page}', query={'page': page})
	116	if 'page' not in parse_qs(urlh.url):
	117	break
	118
	119	episodes = get_elements_html_by_class('fc-item--type-media', webpage)
	120	for url_path in traverse_obj(episodes, (..., {extract_attributes}, 'data-id')):
	121	yield url_path
	122
	123	def _real_extract(self, url):
	124	podcast_id = self._match_id(url)
	125
	126	webpage = self._download_webpage(url, podcast_id)
	127
	128	title = clean_html(get_element_by_class(
	129	'index-page-header__title', webpage) or get_element_by_class('flagship-audio__title', webpage))
	130	description = self._og_search_description(webpage) or self._html_search_meta(
	131	'description', webpage)
	132
	133	return self.playlist_from_matches(
	134	self._entries(url, podcast_id), podcast_id, title, description=description,
	135	ie=TheGuardianPodcastIE, getter=lambda x: urljoin('https://www.theguardian.com', x))