]>
Commit | Line | Data |
---|---|---|
6599c725 | 1 | import re |
add96eb9 | 2 | import urllib.parse |
6599c725 | 3 | |
9751a457 | 4 | from .common import InfoExtractor |
5 | from ..aes import aes_decrypt_text | |
1cc79574 | 6 | from ..utils import ( |
9751a457 | 7 | determine_ext, |
8 | format_field, | |
2fb3deec S |
9 | int_or_none, |
10 | str_to_int, | |
9751a457 | 11 | strip_or_none, |
12 | url_or_none, | |
1d45a23b | 13 | ) |
2fb3deec | 14 | |
1d45a23b | 15 | |
class Tube8IE(InfoExtractor):
    """Extractor for tube8.com video pages (flagged as not working)."""

    _WORKING = False
    _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/(?P<id>\d+)'
    _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)']
    _TESTS = [{
        'url': 'http://www.tube8.com/teen/kasia-music-video/229795/',
        'md5': '65e20c48e6abff62ed0c3965fff13a39',
        'info_dict': {
            'id': '229795',
            'display_id': 'kasia-music-video',
            'ext': 'mp4',
            'description': 'hot teen Kasia grinding',
            'uploader': 'unknown',
            'title': 'Kasia music video',
            'age_limit': 18,
            'duration': 230,
            'categories': ['Teen'],
            'tags': ['dancing'],
        },
    }, {
        'url': 'http://www.tube8.com/shemale/teen/blonde-cd-gets-kidnapped-by-two-blacks-and-punished-for-being-a-slutty-girl/19569151/',
        'only_matching': True,
    }]

    def _extract_info(self, url, fatal=True):
        """Download the video page and collect the core metadata and formats.

        Formats are gathered from the ``flashvars`` JSON object embedded in
        the page (``quality_<N>p`` keys and ``video_url``) and from a
        ``flashvars.video_url = "..."`` assignment, de-duplicated by URL.

        ``fatal`` is accepted for interface compatibility; nothing in this
        method currently reads it.

        Returns a ``(webpage, info_dict)`` tuple.
        """
        mobj = self._match_valid_url(url)
        video_id = mobj.group('id')
        # Fall back to the numeric id when the pattern has no display_id
        # group or the group is empty.
        display_id = (mobj.group('display_id')
                      if 'display_id' in mobj.groupdict()
                      else None) or mobj.group('id')

        webpage = self._download_webpage(
            url, display_id, headers={'Cookie': 'age_verified=1'})

        formats = []
        format_urls = set()

        title = None
        thumbnail = None
        duration = None
        encrypted = False

        def decrypt_url(value):
            """Return the plain URL for a flashvars value.

            When the page marks its URLs as encrypted, the value itself is
            the ciphertext and the video title is the password. Values that
            already look like URLs are passed through untouched. A value
            that cannot be decrypted yields None so the caller skips it.
            """
            if not (encrypted and title and isinstance(value, str)):
                return value
            if value.startswith(('http', '//')):
                return value
            try:
                return aes_decrypt_text(value, title, 32).decode('utf-8')
            except ValueError as err:
                # Covers bad base64 input and non-UTF-8 plaintext.
                self.write_debug(f'Unable to decrypt format URL: {err}')
                return None

        def extract_format(format_url, height=None):
            """Validate one candidate URL and append it to ``formats``."""
            format_url = url_or_none(format_url)
            if not format_url or not format_url.startswith(('http', '//')):
                return
            if format_url in format_urls:
                return
            format_urls.add(format_url)
            # Bitrate and height are encoded in the URL path as e.g.
            # "/1500k/" and "_720p_".
            tbr = int_or_none(self._search_regex(
                r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None))
            if not height:
                height = int_or_none(self._search_regex(
                    r'[/_](\d+)[pP][/_]', format_url, 'height', default=None))
            formats.append({
                'url': format_url,
                'format_id': format_field(height, None, '%dp'),
                'height': height,
                'tbr': tbr,
            })

        flashvars = self._parse_json(
            self._search_regex(
                r'flashvars\s*=\s*({.+?});', webpage,
                'flashvars', default='{}'),
            display_id, fatal=False)

        if flashvars:
            title = flashvars.get('video_title')
            thumbnail = flashvars.get('image_url')
            duration = int_or_none(flashvars.get('video_duration'))
            encrypted = flashvars.get('encrypted') is True
            for key, value in flashvars.items():
                quality = re.search(r'quality_(\d+)[pP]', key)
                if quality:
                    # Each value is decrypted on its own; decryption must
                    # happen before URL validation, which rejects ciphertext.
                    extract_format(
                        decrypt_url(value), int(quality.group(1)))
            video_url = decrypt_url(flashvars.get('video_url'))
            if video_url and determine_ext(video_url, None):
                extract_format(video_url)

        # This pattern only matches plain http URLs, so no decryption here.
        video_url = self._html_search_regex(
            r'flashvars\.video_url\s*=\s*(["\'])(?P<url>http.+?)\1',
            webpage, 'video url', default=None, group='url')
        if video_url:
            extract_format(urllib.parse.unquote(video_url))

        if not formats:
            if 'title="This video is no longer available"' in webpage:
                self.raise_no_formats(
                    f'Video {video_id} is no longer available', expected=True)

        if not title:
            title = self._html_search_regex(
                r'<h1[^>]*>([^<]+)', webpage, 'title')

        return webpage, {
            'id': video_id,
            'display_id': display_id,
            'title': strip_or_none(title),
            'thumbnail': thumbnail,
            'duration': duration,
            'age_limit': 18,
            'formats': formats,
        }

    def _real_extract(self, url):
        """Extract the full info dict for a tube8.com video URL.

        Starts from ``_extract_info`` and adds description, uploader,
        counters, categories and tags scraped from the page. All of the
        extra fields are optional.
        """
        webpage, info = self._extract_info(url)

        if not info['title']:
            info['title'] = self._html_search_regex(
                r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')

        description = self._html_search_regex(
            r'(?s)Description:</dt>\s*<dd>(.+?)</dd>', webpage, 'description', fatal=False)
        uploader = self._html_search_regex(
            r'<span class="username">\s*(.+?)\s*<',
            webpage, 'uploader', fatal=False)

        like_count = int_or_none(self._search_regex(
            r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False))
        dislike_count = int_or_none(self._search_regex(
            r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False))
        view_count = str_to_int(self._search_regex(
            r'Views:\s*</dt>\s*<dd>([\d,\.]+)',
            webpage, 'view count', fatal=False))
        comment_count = str_to_int(self._search_regex(
            r'<span id="allCommentsCount">(\d+)</span>',
            webpage, 'comment count', fatal=False))

        category = self._search_regex(
            r'Category:\s*</dt>\s*<dd>\s*<a[^>]+href=[^>]+>([^<]+)',
            webpage, 'category', fatal=False)
        categories = [category] if category else None

        # Capture the whole tag cell, then pull the text of each link in it.
        tags_str = self._search_regex(
            r'(?s)Tags:\s*</dt>\s*<dd>(.+?)</(?!a)',
            webpage, 'tags', fatal=False)
        tags = re.findall(
            r'<a[^>]+href=[^>]+>([^<]+)', tags_str) if tags_str else None

        info.update({
            'description': description,
            'uploader': uploader,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            'categories': categories,
            'tags': tags,
        })

        return info