]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/onenewsnz.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / onenewsnz.py
1 from .brightcove import BrightcoveNewIE
2 from .common import InfoExtractor
3
4 from ..utils import (
5 ExtractorError,
6 traverse_obj
7 )
8
9
class OneNewsNZIE(InfoExtractor):
    # Extractor for 1news.co.nz / onenews.co.nz article pages. An article is
    # returned as a playlist whose entries are the Brightcove and YouTube
    # videos listed in the page's embedded Fusion metadata.
    IE_NAME = '1News'
    IE_DESC = '1news.co.nz article videos'
    _VALID_URL = r'https?://(?:www\.)?(?:1|one)news\.co\.nz/\d+/\d+/\d+/(?P<id>[^/?#&]+)'
    _TESTS = [
        {   # Brightcove video
            'url': 'https://www.1news.co.nz/2022/09/29/cows-painted-green-on-parliament-lawn-in-climate-protest/',
            'info_dict': {
                'id': 'cows-painted-green-on-parliament-lawn-in-climate-protest',
                'title': '\'Cows\' painted green on Parliament lawn in climate protest',
            },
            'playlist': [{
                'info_dict': {
                    'id': '6312993358112',
                    'title': 'Activists dressed as cows painted green outside Parliament in climate protest',
                    'ext': 'mp4',
                    'tags': 'count:6',
                    'uploader_id': '963482464001',
                    'timestamp': 1664416255,
                    'upload_date': '20220929',
                    'duration': 38.272,
                    'thumbnail': r're:^https?://.*\.jpg$',
                    'description': 'Greenpeace accused the Government of "greenwashing" instead of taking climate action.',
                },
            }],
        }, {
            # YouTube video
            'url': 'https://www.1news.co.nz/2022/09/30/now-is-the-time-to-care-about-womens-rugby/',
            'info_dict': {
                'id': 'now-is-the-time-to-care-about-womens-rugby',
                'title': 'Now is the time to care about women\'s rugby',
            },
            'playlist': [{
                'info_dict': {
                    'id': 's4wEB9neTfU',
                    'title': 'Why I love women’s rugby: Black Fern Ruahei Demant',
                    'ext': 'mp4',
                    'channel_follower_count': int,
                    'channel_url': 'https://www.youtube.com/channel/UC2BQ3U9IxoYIJyulv0bN5PQ',
                    'tags': 'count:12',
                    'uploader': 'Re: News',
                    'upload_date': '20211215',
                    'uploader_id': 'UC2BQ3U9IxoYIJyulv0bN5PQ',
                    'uploader_url': 'http://www.youtube.com/channel/UC2BQ3U9IxoYIJyulv0bN5PQ',
                    'channel_id': 'UC2BQ3U9IxoYIJyulv0bN5PQ',
                    'channel': 'Re: News',
                    'like_count': int,
                    'thumbnail': 'https://i.ytimg.com/vi/s4wEB9neTfU/maxresdefault.jpg',
                    'age_limit': 0,
                    'view_count': int,
                    'categories': ['Sports'],
                    'duration': 222,
                    'description': 'md5:8874410e5740ed1d8fd0df839f849813',
                    'availability': 'public',
                    'playable_in_embed': True,
                    'live_status': 'not_live',
                },
            }],
        }, {
            # 2 Brightcove videos
            'url': 'https://www.1news.co.nz/2022/09/29/raw-videos-capture-hurricane-ians-fury-as-it-slams-florida/',
            'info_dict': {
                'id': 'raw-videos-capture-hurricane-ians-fury-as-it-slams-florida',
                'title': 'Raw videos capture Hurricane Ian\'s fury as it slams Florida',
            },
            'playlist_mincount': 2,
        }, {
            'url': 'https://www.onenews.co.nz/2022/09/29/cows-painted-green-on-parliament-lawn-in-climate-protest/',
            'only_matching': True,
        }]

    # Brightcove player page URL; filled with (account id, video id).
    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/0xpHIR6IB_default/index.html?videoId=%s'

    def _real_extract(self, url):
        """Build a playlist of the videos embedded in a 1News article.

        The article page is downloaded and the JSON assigned to
        ``Fusion.globalContent`` is parsed. Each of its ``content_elements``
        with subtype ``video`` is delegated to the Brightcove extractor and
        each with subtype ``youtube`` to the YouTube extractor; every other
        subtype is ignored.

        Returns the playlist result for the article, titled with the
        article's basic headline, or the generic page title when the
        headline is missing.

        Raises ExtractorError (expected) when no usable video is found.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        fusion_metadata = self._search_json(
            r'Fusion\.globalContent\s*=', webpage, 'fusion metadata', display_id)

        entries = []
        for item in traverse_obj(fusion_metadata, 'content_elements') or []:
            item_type = traverse_obj(item, 'subtype')
            if item_type == 'video':
                brightcove_config = traverse_obj(item, ('embed', 'config'))
                brightcove_video_id = traverse_obj(brightcove_config, 'brightcoveVideoId')
                if not brightcove_video_id:
                    # Without an id the template would yield "videoId=None",
                    # i.e. a broken entry; skip it like the YouTube branch
                    # skips elements without an id.
                    continue
                brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % (
                    # Fall back to a fixed account id when the embed config
                    # does not name one.
                    traverse_obj(brightcove_config, 'brightcoveAccount') or '963482464001',
                    brightcove_video_id,
                )
                entries.append(self.url_result(brightcove_url, BrightcoveNewIE))
            elif item_type == 'youtube':
                # Prefer the referent id and fall back to the oEmbed id.
                video_id_or_url = traverse_obj(item, ('referent', 'id'), ('raw_oembed', '_id'))
                if video_id_or_url:
                    entries.append(self.url_result(video_id_or_url, ie='Youtube'))

        if not entries:
            raise ExtractorError('This article does not have a video.', expected=True)

        playlist_title = (
            traverse_obj(fusion_metadata, ('headlines', 'basic'))
            or self._generic_title('', webpage)
        )
        return self.playlist_result(entries, display_id, playlist_title)