]>
Commit | Line | Data |
---|---|---|
32b95bb6 ZM |
1 | import re |
2 | ||
3 | from .common import InfoExtractor | |
4 | from ..utils import ( | |
5 | clean_html, | |
6 | determine_ext, | |
7 | ExtractorError, | |
8 | extract_attributes, | |
9 | get_element_by_class, | |
10 | get_element_html_by_id, | |
11 | HEADRequest, | |
12 | parse_qs, | |
13 | unescapeHTML, | |
14 | unified_timestamp, | |
15 | ) | |
16 | ||
17 | ||
class MegaTVComBaseIE(InfoExtractor):
    """Shared helpers for the megatv.com extractors."""

    # ``id`` attribute of the HTML element whose attributes describe the player
    _PLAYER_DIV_ID = 'player_div_id'

    def _extract_player_attrs(self, webpage):
        """Return the attributes of the player element as a dict.

        The element is looked up in ``webpage`` by ``_PLAYER_DIV_ID``. A leading
        ``data-`` or ``data-kwik_`` is stripped from every attribute name and
        the ``id`` attribute itself is left out.

        An empty dict is returned when the element cannot be found, so that
        callers can report the missing data themselves (or fall back to other
        page metadata) instead of failing on a ``None`` element.
        """
        player_el = get_element_html_by_id(self._PLAYER_DIV_ID, webpage)
        if not player_el:
            return {}
        return {
            re.sub(r'^data-(?:kwik_)?', '', name): value
            for name, value in extract_attributes(player_el).items()
            if name != 'id'
        }
28 | ||
29 | ||
class MegaTVComIE(MegaTVComBaseIE):
    """Extractor for megatv.com pages.

    ``_VALID_URL`` accepts two URL shapes: date-based paths
    (``/YYYY/MM/DD/<slug>``), which carry no numeric ID, and
    ``/<section>/<id>/<slug>`` paths, which do.
    """

    IE_NAME = 'megatvcom'
    IE_DESC = 'megatv.com videos'
    _VALID_URL = r'https?://(?:www\.)?megatv\.com/(?:\d{4}/\d{2}/\d{2}|[^/]+/(?P<id>\d+))/(?P<slug>[^/]+)'

    _TESTS = [{
        'url': 'https://www.megatv.com/2021/10/23/egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia/',
        'md5': '6546a1a37fff0dd51c9dce5f490b7d7d',
        'info_dict': {
            'id': '520979',
            'ext': 'mp4',
            'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
            'description': 'md5:0209fa8d318128569c0d256a5c404db1',
            'timestamp': 1634975747,
            'upload_date': '20211023',
            'display_id': 'egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia',
            'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/10/ΠΕΙΡΑΙΑΣ-1024x450.jpg',
        },
    }, {
        'url': 'https://www.megatv.com/tvshows/527800/epeisodio-65-12/',
        'md5': 'cba2085d45c1abeb8e7e9b7e1d6c0072',
        'info_dict': {
            'id': '527800',
            'ext': 'mp4',
            'title': 'md5:fc322cb51f682eecfe2f54cd5ab3a157',
            'description': 'md5:b2b7ed3690a78f2a0156eb790fdc00df',
            'timestamp': 1636048859,
            'upload_date': '20211104',
            'display_id': 'epeisodio-65-12',
            'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/11/16-1-1.jpg',
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for a megatv.com page.

        Title and thumbnail come from the player attributes (``label`` and
        ``image``), falling back to the page's Open Graph data. The description
        is taken from the page body, again with an Open Graph fallback. The
        media URL is the player's ``source`` attribute.

        Raises:
            ExtractorError: if the player attributes contain no ``source``.
        """
        video_id, display_id = self._match_valid_url(url).group('id', 'slug')
        # Date-based URLs have no numeric ID; those pages are treated as articles
        is_article = video_id is None
        webpage = self._download_webpage(url, video_id or display_id)
        if is_article:
            video_id = self._search_regex(
                r'<article[^>]*\sid=["\']Article_(\d+)["\']', webpage, 'article id')

        player_attrs = self._extract_player_attrs(webpage)
        title = player_attrs.get('label') or self._og_search_title(webpage)

        description_html = get_element_by_class(
            'article-wrapper' if is_article else 'story_content', webpage)
        # The container may be missing; substitute an empty string so that
        # re.sub() does not choke on None and the Open Graph fallback below
        # actually gets a chance to run.
        description = clean_html(
            re.sub(r'<script[^>]*>[^<]+</script>', '', description_html or ''))
        if not description:
            description = self._og_search_description(webpage)

        thumbnail = player_attrs.get('image') or self._og_search_thumbnail(webpage)
        timestamp = unified_timestamp(self._html_search_meta(
            'article:published_time', webpage))

        source = player_attrs.get('source')
        if not source:
            raise ExtractorError('No source found', video_id=video_id)
        if determine_ext(source) == 'm3u8':
            formats, subs = self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4')
        else:
            formats, subs = [{'url': source}], {}
        if player_attrs.get('subs'):
            # The player attributes name no subtitle language, hence 'und'
            self._merge_subtitles({'und': [{'url': player_attrs['subs']}]}, target=subs)

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'formats': formats,
            'subtitles': subs,
        }
100 | ||
101 | ||
class MegaTVComEmbedIE(MegaTVComBaseIE):
    """Extractor for megatv.com embed URLs (``/embed/?p=<id>``).

    The embed page is only used to find the canonical page URL; the actual
    extraction is delegated to ``MegaTVComIE``.
    """

    IE_NAME = 'megatvcom:embed'
    IE_DESC = 'megatv.com embedded videos'
    _VALID_URL = r'(?:https?:)?//(?:www\.)?megatv\.com/embed/?\?p=(?P<id>\d+)'
    _EMBED_REGEX = [rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''']

    _TESTS = [{
        'url': 'https://www.megatv.com/embed/?p=2020520979',
        'md5': '6546a1a37fff0dd51c9dce5f490b7d7d',
        'info_dict': {
            'id': '520979',
            'ext': 'mp4',
            'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
            'description': 'md5:0209fa8d318128569c0d256a5c404db1',
            'timestamp': 1634975747,
            'upload_date': '20211023',
            'display_id': 'egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia',
            'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/10/ΠΕΙΡΑΙΑΣ-1024x450.jpg',
        },
    }, {
        'url': 'https://www.megatv.com/embed/?p=2020534081',
        'md5': '6ac8b3ce4dc6120c802f780a1e6b3812',
        'info_dict': {
            'id': '534081',
            'ext': 'mp4',
            'title': 'md5:062e9d5976ef854d8bdc1f5724d9b2d0',
            'description': 'md5:36dbe4c3762d2ede9513eea8d07f6d52',
            'timestamp': 1636376351,
            'upload_date': '20211108',
            'display_id': 'neo-rekor-stin-timi-tou-ilektrikou-reymatos-pano-apo-ta-200e-i-xondriki-timi-tou-ilektrikou',
            'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/11/Capture-266.jpg',
        },
    }]

    def _match_canonical_url(self, webpage):
        """Return the ``href`` of the first ``<link>`` tag that also has ``rel="canonical"``.

        The attributes may appear in any order within the tag. The value is
        HTML-unescaped before being returned. Returns ``None`` when no such tag
        is found.
        """
        LINK_RE = r'''(?x)
        <link(?:
            rel=(?P<_q1>["'])(?P<canonical>canonical)(?P=_q1)|
            href=(?P<_q2>["'])(?P<href>(?:(?!(?P=_q2)).)+)(?P=_q2)|
            [^>]*?
        )+>
        '''
        for mobj in re.finditer(LINK_RE, webpage):
            canonical, href = mobj.group('canonical', 'href')
            if canonical and href:
                return unescapeHTML(href)
        return None

    def _real_extract(self, url):
        """Resolve the embed page to its canonical URL and hand over to ``MegaTVComIE``.

        The canonical URL is the player's ``share_url`` attribute or, failing
        that, the page's ``<link rel="canonical">``.

        Raises:
            ExtractorError: if neither source yields a canonical URL.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        player_attrs = self._extract_player_attrs(webpage)
        canonical_url = player_attrs.get('share_url') or self._match_canonical_url(webpage)
        if not canonical_url:
            raise ExtractorError('canonical URL not found')
        # Prefer the ID from the canonical URL's "p" parameter, but keep the one
        # from the embed URL when that parameter is absent (rather than failing
        # with a KeyError).
        video_id = (parse_qs(canonical_url).get('p') or [video_id])[0]

        # Defer to megatvcom, as the metadata extracted from the embeddable page
        # is sometimes slightly different for the same video
        canonical_url = self._request_webpage(
            HEADRequest(canonical_url), video_id,
            note='Resolve canonical URL',
            errnote='Could not resolve canonical URL').geturl()
        return self.url_result(canonical_url, MegaTVComIE.ie_key(), video_id)