]> jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/veehd.py
[ie/box] Fix formats extraction (#8649)
[yt-dlp.git] / yt_dlp / extractor / veehd.py
1 import re
2 import json
3
4 from .common import InfoExtractor
5 from ..compat import (
6 compat_urllib_parse_unquote,
7 compat_urlparse,
8 )
9 from ..utils import (
10 ExtractorError,
11 clean_html,
12 get_element_by_id,
13 )
14
15
class VeeHDIE(InfoExtractor):
    """Extractor for single video pages on veehd.com."""

    _VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)'

    # Seems VeeHD videos have multiple copies on several servers, all of
    # which have different MD5 checksums, so omit md5 field in all tests
    _TESTS = [{
        'url': 'http://veehd.com/video/4639434_Solar-Sinter',
        'info_dict': {
            'id': '4639434',
            'ext': 'mp4',
            'title': 'Solar Sinter',
            'uploader_id': 'VideoEyes',
            'description': 'md5:46a840e8692ddbaffb5f81d9885cb457',
        },
        'skip': 'Video deleted',
    }, {
        'url': 'http://veehd.com/video/4905758_Elysian-Fields-Channeling',
        'info_dict': {
            'id': '4905758',
            'ext': 'mp4',
            'title': 'Elysian Fields - Channeling',
            'description': 'md5:360e4e95fdab58aefbea0f2a19e5604b',
            'uploader_id': 'spotted',
        }
    }, {
        'url': 'http://veehd.com/video/2046729_2012-2009-DivX-Trailer',
        'info_dict': {
            'id': '2046729',
            'ext': 'avi',
            'title': '2012 (2009) DivX Trailer',
            'description': 'md5:75435ee95255e6a9838ac6f6f3a2396b',
            'uploader_id': 'Movie_Trailers',
        }
    }]

    def _download_twice(self, url, video_id, warmup_note, note=None):
        """Request *url* twice and return the body of the second response.

        VeeHD seems to send garbage on the first request, so the first
        response is discarded.
        See https://github.com/ytdl-org/youtube-dl/issues/2102

        *warmup_note* is the progress note for the discarded request and
        *note* the one for the request whose body is returned.
        """
        self._download_webpage(url, video_id, warmup_note)
        return self._download_webpage(url, video_id, note)

    def _extract_video_url(self, player_page, video_id):
        """Return the media URL reachable from *player_page*.

        Sources are tried in this order:
          1. the ``config=`` JSON embedded in a ``value='...'`` attribute,
          2. the ``src`` of a ``video/divx`` ``<embed>`` tag,
          3. the ``file: '...'`` entry of the page behind the first iframe.

        The first two are best-effort; the last one uses fatal regex
        searches, so a failure there propagates to the caller.
        """
        config_json = self._search_regex(
            r'value=\'config=({.+?})\'', player_page, 'config json', default=None)
        if config_json:
            try:
                config = json.loads(config_json)
            except ValueError:
                # Malformed config: fall through to the remaining sources
                # instead of aborting the whole extraction.
                config = None
            clip = config.get('clip') if isinstance(config, dict) else None
            clip_url = clip.get('url') if isinstance(clip, dict) else None
            if clip_url and isinstance(clip_url, str):
                return compat_urllib_parse_unquote(clip_url)

        embed_url = self._html_search_regex(
            r'<embed[^>]+type="video/divx"[^>]+src="([^"]+)"',
            player_page, 'video url', default=None)
        if embed_url:
            return embed_url

        # The pattern drops an optional leading slash, so the captured path
        # is always appended to the site root.
        iframe_src = self._search_regex(
            r'<iframe[^>]+src="/?([^"]+)"', player_page, 'iframe url')
        iframe_url = 'http://veehd.com/%s' % iframe_src
        iframe_page = self._download_twice(
            iframe_url, video_id,
            'Requesting iframe page', 'Downloading iframe page')
        return self._search_regex(
            r"file\s*:\s*'([^']+)'", iframe_page, 'video url')

    def _real_extract(self, url):
        """Extract the info dict for the VeeHD video page at *url*.

        Raises ExtractorError when the page reports the video as removed or
        when the title element cannot be found. Uploader, thumbnail and
        description are optional: when one is missing it is left as None
        rather than failing the extraction.
        """
        video_id = self._match_id(url)

        webpage = self._download_twice(url, video_id, 'Requesting webpage')

        if 'This video has been removed<' in webpage:
            raise ExtractorError('Video %s has been removed' % video_id, expected=True)

        player_path = self._search_regex(
            r'\$\("#playeriframe"\).attr\({src : "(.+?)"',
            webpage, 'player path')
        player_url = compat_urlparse.urljoin(url, player_path)
        player_page = self._download_twice(
            player_url, video_id,
            'Requesting player page', 'Downloading player page')

        video_url = self._extract_video_url(player_page, video_id)

        raw_title = get_element_by_id('videoName', webpage)
        if raw_title is None:
            raise ExtractorError('Unable to extract title')
        # The element text looks like "<title> | <suffix>"; keep what precedes
        # the last '|'. Without a separator the whole text is the title
        # (rpartition would otherwise hand back an empty head).
        head, sep, tail = raw_title.rpartition('|')
        title = clean_html(head if sep else tail)

        # Optional metadata must not abort an otherwise successful extraction.
        uploader_id = self._html_search_regex(
            r'<a href="/profile/\d+">(.+?)</a>',
            webpage, 'uploader', fatal=False)
        thumbnail = self._search_regex(
            r'<img id="veehdpreview" src="(.+?)"',
            webpage, 'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<td class="infodropdown".*?<div>(.*?)<ul',
            webpage, 'description', fatal=False, flags=re.DOTALL)

        return {
            '_type': 'video',
            'id': video_id,
            'title': title,
            'url': video_url,
            'uploader_id': uploader_id,
            'thumbnail': thumbnail,
            'description': description,
        }