jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/elementorembed.py
[ie/orf:on] Improve extraction (#9677)
[yt-dlp.git] / yt_dlp / extractor / elementorembed.py
CommitLineData
6171b050
K
1import re
2
3from .common import InfoExtractor
4from .vimeo import VimeoIE
5from .youtube import YoutubeIE
6from ..utils import unescapeHTML, url_or_none
7from ..utils.traversal import traverse_obj
8
9
class ElementorEmbedIE(InfoExtractor):
    """Extract videos embedded through Elementor video / video-playlist widgets.

    URL matching is switched off (``_VALID_URL = False``); the only entry
    point defined here is ``_extract_from_webpage``, which scans page HTML
    for widget markup matching ``_WIDGET_REGEX``.
    """

    _VALID_URL = False
    _WEBPAGE_TESTS = [{
        'url': 'https://capitaltv.cy/2023/12/14/υγεια-και-ζωη-14-12-2023-δρ-ξενια-κωσταντινιδο/',
        'info_dict': {
            'id': 'KgzuxwuQwM4',
            'ext': 'mp4',
            'title': 'ΥΓΕΙΑ ΚΑΙ ΖΩΗ 14 12 2023 ΔΡ ΞΕΝΙΑ ΚΩΣΤΑΝΤΙΝΙΔΟΥ',
            'thumbnail': 'https://i.ytimg.com/vi/KgzuxwuQwM4/maxresdefault.jpg',
            'playable_in_embed': True,
            'tags': 'count:16',
            'like_count': int,
            'channel': 'Capital TV Cyprus',
            'channel_id': 'UCR8LwVKTLGEXt4ZAErpCMrg',
            'availability': 'public',
            'description': 'md5:7a3308a22881aea4612358c4ba121f77',
            'duration': 2891,
            'upload_date': '20231214',
            'uploader_id': '@capitaltvcyprus6389',
            'live_status': 'not_live',
            'channel_url': 'https://www.youtube.com/channel/UCR8LwVKTLGEXt4ZAErpCMrg',
            'uploader_url': 'https://www.youtube.com/@capitaltvcyprus6389',
            'uploader': 'Capital TV Cyprus',
            'age_limit': 0,
            'categories': ['News & Politics'],
            'view_count': int,
            'channel_follower_count': int,
        },
    }, {
        'url': 'https://elementor.com/academy/theme-builder-collection/?playlist=76011151&video=9e59909',
        'info_dict': {
            'id': '?playlist=76011151&video=9e59909',
            'title': 'Theme Builder Collection - Academy',
            'age_limit': 0,
            'timestamp': 1702196984.0,
            'upload_date': '20231210',
            'description': 'md5:7f52c52715ee9e54fd7f82210511673d',
            'thumbnail': 'https://elementor.com/academy/wp-content/uploads/2021/07/Theme-Builder-1.png',
        },
        'playlist_count': 11,
        'params': {
            'skip_download': True,
        },
    }]

    # Captures the raw (still HTML-escaped) ``data-settings`` attribute of any
    # <div> whose class list contains ``elementor-widget-video`` or
    # ``elementor-widget-video-playlist``.
    _WIDGET_REGEX = r'<div[^>]+class="[^"]*elementor-widget-video(?:-playlist)?[^"]*"[^>]*data-settings="([^"]*)"'

    def _extract_from_webpage(self, url, webpage):
        """Yield one result per video referenced by Elementor widgets in *webpage*.

        For every widget matched by ``_WIDGET_REGEX`` the ``data-settings``
        attribute is HTML-unescaped and parsed as JSON (non-fatally). Results
        are produced in this order per widget:

        * the widget-level ``youtube_url`` (as a YouTube URL result), then
        * for each ``tabs`` entry that is a dict with a truthy ``_id``:
          its ``youtube_url``, its ``vimeo_url``, and finally a plain info
          dict for each valid ``hosted_url``/``external_url`` -> ``url``.

        *url* is accepted for interface compatibility and is not used here.
        """
        for raw_settings in re.findall(self._WIDGET_REGEX, webpage):
            settings = self._parse_json(
                raw_settings, None, fatal=False, transform_source=unescapeHTML)

            # Single-video widget: the link sits directly on the settings object.
            widget_youtube = traverse_obj(settings, ('youtube_url', {url_or_none}))
            if widget_youtube:
                yield self.url_result(widget_youtube, ie=YoutubeIE)

            # Playlist widget: one candidate video per tab carrying an ``_id``.
            playlist_tabs = traverse_obj(settings, ('tabs', lambda _, v: v['_id'], {dict}))
            for tab in playlist_tabs:
                tab_youtube = traverse_obj(tab, ('youtube_url', {url_or_none}))
                if tab_youtube:
                    yield self.url_result(tab_youtube, ie=YoutubeIE)

                tab_vimeo = traverse_obj(tab, ('vimeo_url', {url_or_none}))
                if tab_vimeo:
                    yield self.url_result(tab_vimeo, ie=VimeoIE)

                # Directly linked media files are returned as bare info dicts.
                media_urls = traverse_obj(
                    tab, (('hosted_url', 'external_url'), 'url', {url_or_none}))
                for media_url in media_urls:
                    yield {
                        'id': tab['_id'],
                        'url': media_url,
                        'title': tab.get('title'),
                    }