jfr.im git - yt-dlp.git/blame_incremental - yt

Commit	Line	Data
	1	from .common import InfoExtractor
	2	from ..utils import parse_duration, parse_iso8601, traverse_obj
	3
	4
	5	class NOSNLArticleIE(InfoExtractor):
	6	_VALID_URL = r'https?://nos\.nl/(?P<type>video\|(\w+/)?\w+)/?\d+-(?P<display_id>[\w-]+)'
	7	_TESTS = [
	8	{
	9	# only 1 video
	10	'url': 'https://nos.nl/nieuwsuur/artikel/2440353-verzakking-door-droogte-dreigt-tot-een-miljoen-kwetsbare-huizen',
	11	'info_dict': {
	12	'id': '2440340',
	13	'ext': 'mp4',
	14	'description': 'md5:5f83185d902ac97af3af4bed7ece3db5',
	15	'title': '\'We hebben een huis vol met scheuren\'',
	16	'duration': 95.0,
	17	'thumbnail': 'https://cdn.nos.nl/image/2022/08/12/887149/3840x2160a.jpg',
	18	}
	19	}, {
	20	# more than 1 video
	21	'url': 'https://nos.nl/artikel/2440409-vannacht-sliepen-weer-enkele-honderden-asielzoekers-in-ter-apel-buiten',
	22	'info_dict': {
	23	'id': '2440409',
	24	'title': 'Vannacht sliepen weer enkele honderden asielzoekers in Ter Apel buiten',
	25	'description': 'md5:72b1e1674d798460e79d78fa37e9f56d',
	26	'tags': ['aanmeldcentrum', 'Centraal Orgaan opvang asielzoekers', 'COA', 'asielzoekers', 'Ter Apel'],
	27	'modified_timestamp': 1660452773,
	28	'modified_date': '20220814',
	29	'upload_date': '20220813',
	30	'thumbnail': 'https://cdn.nos.nl/image/2022/07/18/880346/1024x576a.jpg',
	31	'timestamp': 1660401384,
	32	'categories': ['Regionaal nieuws', 'Binnenland'],
	33	},
	34	'playlist_count': 2,
	35	}, {
	36	# audio + video
	37	'url': 'https://nos.nl/artikel/2440789-wekdienst-16-8-groningse-acties-tien-jaar-na-zware-aardbeving-femke-bol-in-actie-op-ek-atletiek',
	38	'info_dict': {
	39	'id': '2440789',
	40	'title': 'Wekdienst 16/8: Groningse acties tien jaar na zware aardbeving • Femke Bol in actie op EK atletiek ',
	41	'description': 'md5:0bd277ed7a44fc15cb12a9d27d8f6641',
	42	'tags': ['wekdienst'],
	43	'modified_date': '20220816',
	44	'modified_timestamp': 1660625449,
	45	'timestamp': 1660625449,
	46	'upload_date': '20220816',
	47	'thumbnail': 'https://cdn.nos.nl/image/2022/08/16/888178/1024x576a.jpg',
	48	'categories': ['Binnenland', 'Buitenland'],
	49	},
	50	'playlist_count': 2,
	51	}, {
	52	# video url
	53	'url': 'https://nos.nl/video/2452718-xi-en-trudeau-botsen-voor-de-camera-op-g20-top-je-hebt-gelekt',
	54	'info_dict': {
	55	'id': '2452718',
	56	'title': 'Xi en Trudeau botsen voor de camera op G20-top: \'Je hebt gelekt\'',
	57	'modified_date': '20221117',
	58	'description': 'md5:61907dac576f75c11bf8ffffd4a3cc0f',
	59	'tags': ['Xi', 'Trudeau', 'G20', 'indonesié'],
	60	'upload_date': '20221117',
	61	'thumbnail': 'https://cdn.nos.nl/image/2022/11/17/916155/1024x576a.jpg',
	62	'modified_timestamp': 1668663388,
	63	'timestamp': 1668663388,
	64	'categories': ['Buitenland'],
	65	},
	66	'playlist_mincount': 1,
	67	}
	68	]
	69
	70	def _entries(self, nextjs_json, display_id):
	71	for item in nextjs_json:
	72	if item.get('type') == 'video':
	73	formats, subtitle = self._extract_m3u8_formats_and_subtitles(
	74	traverse_obj(item, ('source', 'url')), display_id, ext='mp4')
	75	yield {
	76	'id': str(item['id']),
	77	'title': item.get('title'),
	78	'description': item.get('description'),
	79	'formats': formats,
	80	'subtitles': subtitle,
	81	'duration': parse_duration(item.get('duration')),
	82	'thumbnails': [{
	83	'url': traverse_obj(image, ('url', ...), get_all=False),
	84	'width': image.get('width'),
	85	'height': image.get('height')
	86	} for image in traverse_obj(item, ('imagesByRatio', ...))[0]],
	87	}
	88
	89	elif item.get('type') == 'audio':
	90	yield {
	91	'id': str(item['id']),
	92	'title': item.get('title'),
	93	'url': traverse_obj(item, ('media', 'src')),
	94	'ext': 'mp3',
	95	}
	96
	97	def _real_extract(self, url):
	98	site_type, display_id = self._match_valid_url(url).group('type', 'display_id')
	99	webpage = self._download_webpage(url, display_id)
	100
	101	nextjs_json = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['data']
	102	return {
	103	'_type': 'playlist',
	104	'entries': self._entries(
	105	[nextjs_json['video']] if site_type == 'video' else nextjs_json['items'], display_id),
	106	'id': str(nextjs_json['id']),
	107	'title': nextjs_json.get('title') or self._html_search_meta(['title', 'og:title', 'twitter:title'], webpage),
	108	'description': (nextjs_json.get('description')
	109	or self._html_search_meta(['description', 'twitter:description', 'og:description'], webpage)),
	110	'tags': nextjs_json.get('keywords'),
	111	'modified_timestamp': parse_iso8601(nextjs_json.get('modifiedAt')),
	112	'thumbnail': nextjs_json.get('shareImageSrc') or self._html_search_meta(['og:image', 'twitter:image'], webpage),
	113	'timestamp': parse_iso8601(nextjs_json.get('publishedAt')),
	114	'categories': traverse_obj(nextjs_json, ('categories', ..., 'label')),
	115	}

1

from .common import InfoExtractor

2

from ..utils import parse_duration, parse_iso8601, traverse_obj

3

4

5

class NOSNLArticleIE(InfoExtractor):
    """Extractor for nos.nl article and video pages.

    Every matched page is returned as a playlist whose entries are the
    ``video`` and ``audio`` items found in the page's Next.js data.
    """

    _VALID_URL = r'https?://nos\.nl/(?P<type>video|(\w+/)?\w+)/?\d+-(?P<display_id>[\w-]+)'
    _TESTS = [
        {
            # only 1 video
            'url': 'https://nos.nl/nieuwsuur/artikel/2440353-verzakking-door-droogte-dreigt-tot-een-miljoen-kwetsbare-huizen',
            'info_dict': {
                'id': '2440340',
                'ext': 'mp4',
                'description': 'md5:5f83185d902ac97af3af4bed7ece3db5',
                'title': '\'We hebben een huis vol met scheuren\'',
                'duration': 95.0,
                'thumbnail': 'https://cdn.nos.nl/image/2022/08/12/887149/3840x2160a.jpg',
            }
        }, {
            # more than 1 video
            'url': 'https://nos.nl/artikel/2440409-vannacht-sliepen-weer-enkele-honderden-asielzoekers-in-ter-apel-buiten',
            'info_dict': {
                'id': '2440409',
                'title': 'Vannacht sliepen weer enkele honderden asielzoekers in Ter Apel buiten',
                'description': 'md5:72b1e1674d798460e79d78fa37e9f56d',
                'tags': ['aanmeldcentrum', 'Centraal Orgaan opvang asielzoekers', 'COA', 'asielzoekers', 'Ter Apel'],
                'modified_timestamp': 1660452773,
                'modified_date': '20220814',
                'upload_date': '20220813',
                'thumbnail': 'https://cdn.nos.nl/image/2022/07/18/880346/1024x576a.jpg',
                'timestamp': 1660401384,
                'categories': ['Regionaal nieuws', 'Binnenland'],
            },
            'playlist_count': 2,
        }, {
            # audio + video
            'url': 'https://nos.nl/artikel/2440789-wekdienst-16-8-groningse-acties-tien-jaar-na-zware-aardbeving-femke-bol-in-actie-op-ek-atletiek',
            'info_dict': {
                'id': '2440789',
                'title': 'Wekdienst 16/8: Groningse acties tien jaar na zware aardbeving • Femke Bol in actie op EK atletiek ',
                'description': 'md5:0bd277ed7a44fc15cb12a9d27d8f6641',
                'tags': ['wekdienst'],
                'modified_date': '20220816',
                'modified_timestamp': 1660625449,
                'timestamp': 1660625449,
                'upload_date': '20220816',
                'thumbnail': 'https://cdn.nos.nl/image/2022/08/16/888178/1024x576a.jpg',
                'categories': ['Binnenland', 'Buitenland'],
            },
            'playlist_count': 2,
        }, {
            # video url
            'url': 'https://nos.nl/video/2452718-xi-en-trudeau-botsen-voor-de-camera-op-g20-top-je-hebt-gelekt',
            'info_dict': {
                'id': '2452718',
                'title': 'Xi en Trudeau botsen voor de camera op G20-top: \'Je hebt gelekt\'',
                'modified_date': '20221117',
                'description': 'md5:61907dac576f75c11bf8ffffd4a3cc0f',
                'tags': ['Xi', 'Trudeau', 'G20', 'indonesié'],
                'upload_date': '20221117',
                'thumbnail': 'https://cdn.nos.nl/image/2022/11/17/916155/1024x576a.jpg',
                'modified_timestamp': 1668663388,
                'timestamp': 1668663388,
                'categories': ['Buitenland'],
            },
            'playlist_mincount': 1,
        }
    ]

    def _entries(self, nextjs_json, display_id):
        """Yield one info dict per ``video`` or ``audio`` item.

        ``nextjs_json`` is an iterable of item dicts; items whose ``type`` is
        neither ``video`` nor ``audio`` are skipped. ``display_id`` is passed
        to the m3u8 helper as the video id.
        """
        for item in nextjs_json:
            if item.get('type') == 'video':
                formats, subtitle = self._extract_m3u8_formats_and_subtitles(
                    traverse_obj(item, ('source', 'url')), display_id, ext='mp4')
                # Only the first image list under 'imagesByRatio' is used.
                # Fall back to an empty list instead of indexing [0], which
                # raised IndexError for items that carry no images at all.
                images = traverse_obj(item, ('imagesByRatio', ...), get_all=False) or []
                yield {
                    'id': str(item['id']),
                    'title': item.get('title'),
                    'description': item.get('description'),
                    'formats': formats,
                    'subtitles': subtitle,
                    'duration': parse_duration(item.get('duration')),
                    'thumbnails': [{
                        'url': traverse_obj(image, ('url', ...), get_all=False),
                        'width': image.get('width'),
                        'height': image.get('height'),
                    } for image in images],
                }

            elif item.get('type') == 'audio':
                yield {
                    'id': str(item['id']),
                    'title': item.get('title'),
                    'url': traverse_obj(item, ('media', 'src')),
                    'ext': 'mp3',
                }

    def _real_extract(self, url):
        """Download the page and return a playlist built from its Next.js data.

        For URLs whose ``type`` group is ``video`` the single ``video`` object
        is used as the only candidate item; otherwise the page's ``items``
        list is used. Title, description and thumbnail fall back to HTML meta
        tags when the JSON does not provide them.
        """
        site_type, display_id = self._match_valid_url(url).group('type', 'display_id')
        webpage = self._download_webpage(url, display_id)

        nextjs_json = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['data']
        return {
            '_type': 'playlist',
            'entries': self._entries(
                [nextjs_json['video']] if site_type == 'video' else nextjs_json['items'], display_id),
            'id': str(nextjs_json['id']),
            'title': nextjs_json.get('title') or self._html_search_meta(['title', 'og:title', 'twitter:title'], webpage),
            'description': (nextjs_json.get('description')
                            or self._html_search_meta(['description', 'twitter:description', 'og:description'], webpage)),
            'tags': nextjs_json.get('keywords'),
            'modified_timestamp': parse_iso8601(nextjs_json.get('modifiedAt')),
            'thumbnail': nextjs_json.get('shareImageSrc') or self._html_search_meta(['og:image', 'twitter:image'], webpage),
            'timestamp': parse_iso8601(nextjs_json.get('publishedAt')),
            'categories': traverse_obj(nextjs_json, ('categories', ..., 'label')),
        }