]>
Commit | Line | Data |
---|---|---|
53006b35 | 1 | import re |
2 | ||
3c4eebf7 | 3 | from .common import InfoExtractor |
53006b35 | 4 | from ..utils import ( |
5 | ExtractorError, | |
6 | clean_html, | |
7 | float_or_none, | |
8 | get_element_by_attribute, | |
9 | get_element_by_class, | |
10 | int_or_none, | |
11 | js_to_json, | |
12 | traverse_obj, | |
13 | url_or_none, | |
14 | ) | |
3c4eebf7 | 15 | |
16 | ||
class AmazonStoreIE(InfoExtractor):
    """Extractor for the product videos embedded in an Amazon product page.

    The product page is expected to carry a ``var obj = jQuery.parseJSON('...')``
    blob; every item of its ``videos`` list that is flagged ``isVideo`` and has
    a ``url`` becomes one entry of the returned playlist.
    """

    _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/(?:[^/]+/)?(?:dp|gp/product)/(?P<id>[^/&#$?]+)'

    _TESTS = [{
        'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/',
        'info_dict': {
            'id': 'B098XNCHLD',
            'title': str,
        },
        'playlist_mincount': 1,
        'playlist': [{
            'info_dict': {
                'id': 'A1F83G8C2ARO7P',
                'ext': 'mp4',
                'title': 'mcdodo usb c cable 100W 5a',
                'thumbnail': r're:^https?://.*\.jpg$',
                'duration': 34,
            },
        }],
        'expected_warnings': ['Unable to extract data'],
    }, {
        'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3',
        'info_dict': {
            'id': 'B0863TXGM3',
            'title': str,
        },
        'playlist_mincount': 4,
        'expected_warnings': ['Unable to extract data'],
    }, {
        'url': 'https://www.amazon.com/dp/B0845NXCXF/',
        'info_dict': {
            'id': 'B0845NXCXF',
            'title': str,
        },
        # Was spelled 'playlist-mincount' (hyphen), unlike every other test
        # case here, so the minimum-entry check was not being requested.
        'playlist_mincount': 1,
        'expected_warnings': ['Unable to extract data'],
    }, {
        'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ',
        'info_dict': {
            'id': 'B08WX337PQ',
            'title': str,
        },
        'playlist_mincount': 1,
        'expected_warnings': ['Unable to extract data'],
    }]

    def _real_extract(self, url):
        """Return a playlist of the videos listed in the product page's JSON blob.

        :param url: product page URL matching ``_VALID_URL``.
        :returns: the result of ``self.playlist_result`` with one entry per
            usable video, the product id as playlist id and the blob's
            ``title`` as playlist title.
        """
        # Named playlist_id rather than `id` to avoid shadowing the builtin.
        playlist_id = self._match_id(url)

        for retry in self.RetryManager():
            webpage = self._download_webpage(url, playlist_id)
            try:
                data_json = self._search_json(
                    r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', playlist_id,
                    transform_source=js_to_json)
            except ExtractorError as e:
                # Hand the failure to the retry manager instead of raising here
                # (presumably this triggers another download attempt; the retry
                # policy lives in RetryManager, which is not shown in this file).
                retry.error = e

        # Only items explicitly flagged as videos and carrying a URL are kept.
        entries = [{
            'id': video['marketPlaceID'],
            'url': video['url'],
            'title': video.get('title'),
            'thumbnail': video.get('thumbUrl') or video.get('thumb'),
            'duration': video.get('durationSeconds'),
            'height': int_or_none(video.get('videoHeight')),
            'width': int_or_none(video.get('videoWidth')),
        } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')]
        return self.playlist_result(entries, playlist_id=playlist_id, playlist_title=data_json.get('title'))
53006b35 | 85 | |
86 | ||
class AmazonReviewsIE(InfoExtractor):
    """Extractor for the video attached to an Amazon customer review page.

    Formats are collected from the ``data-video-url`` attribute (treated as an
    m3u8 manifest) and from the ``video-url`` input element found inside the
    review body; the remaining metadata is scraped from the page HTML.
    """

    _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/gp/customer-reviews/(?P<id>[^/&#$?]+)'
    _TESTS = [{
        'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl',
        'info_dict': {
            'id': 'R10VE9VUSY19L3',
            'ext': 'mp4',
            'title': 'Get squad #Suspicious',
            'description': 'md5:7012695052f440a1e064e402d87e0afb',
            'uploader': 'Kimberly Cronkright',
            'average_rating': 1.0,
            'thumbnail': r're:^https?://.*\.jpg$',
        },
        'expected_warnings': ['Review body was not found in webpage'],
    }, {
        'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl?language=es_US',
        'info_dict': {
            'id': 'R10VE9VUSY19L3',
            'ext': 'mp4',
            'title': 'Get squad #Suspicious',
            'description': 'md5:7012695052f440a1e064e402d87e0afb',
            'uploader': 'Kimberly Cronkright',
            'average_rating': 1.0,
            'thumbnail': r're:^https?://.*\.jpg$',
        },
        'expected_warnings': ['Review body was not found in webpage'],
    }, {
        'url': 'https://www.amazon.in/gp/customer-reviews/RV1CO8JN5VGXV/',
        'info_dict': {
            'id': 'RV1CO8JN5VGXV',
            'ext': 'mp4',
            'title': 'Not sure about its durability',
            'description': 'md5:1a252c106357f0a3109ebf37d2e87494',
            'uploader': 'Shoaib Gulzar',
            'average_rating': 2.0,
            'thumbnail': r're:^https?://.*\.jpg$',
        },
        'expected_warnings': ['Review body was not found in webpage'],
    }]

    def _real_extract(self, url):
        """Build the info dict for the review video found at ``url``.

        :param url: customer-review URL matching ``_VALID_URL``.
        :returns: dict with ``id``, ``title``, ``description``, ``uploader``,
            ``average_rating``, ``thumbnail``, ``formats`` and ``subtitles``.
        """
        review_id = self._match_id(url)

        for attempt in self.RetryManager():
            page = self._download_webpage(url, review_id)
            body_html = get_element_by_attribute('data-hook', 'review-body', page)
            if not body_html:
                # A missing review body is reported to the retry manager
                # rather than raised directly.
                attempt.error = ExtractorError('Review body was not found in webpage', expected=True)

        formats, subtitles = [], {}

        # Streaming variant: manifest URL carried in a data attribute.
        hls_url = self._search_regex(
            r'data-video-url="([^"]+)"', body_html, 'm3u8 url', default=None)
        if url_or_none(hls_url):
            hls_formats, subtitles = self._extract_m3u8_formats_and_subtitles(
                hls_url, review_id, 'mp4', fatal=False)
            formats.extend(hls_formats)

        # Direct variant: plain mp4 URL stored in a hidden input's value.
        direct_url = self._search_regex(
            r'<input[^>]+\bvalue="([^"]+)"[^>]+\bclass="video-url"', body_html, 'mp4 url', default=None)
        if url_or_none(direct_url):
            formats.append({'url': direct_url, 'ext': 'mp4', 'format_id': 'http-mp4'})

        if not formats:
            self.raise_no_formats('No video found for this customer review', expected=True)

        # Prefer the review's own title element, falling back to the page <title>.
        title = clean_html(get_element_by_attribute('data-hook', 'review-title', page))
        if not title:
            title = self._html_extract_title(page)

        # The last matching <span> of the review body is used as the description.
        span_contents = re.findall(
            r'<span(?:\s+class="cr-original-review-content")?>(.+?)</span>', body_html)
        description = clean_html(traverse_obj(span_contents, -1))

        uploader = clean_html(get_element_by_class('a-profile-name', page))

        # Only the text before the first space of the rating element is parsed.
        rating_html = get_element_by_attribute('data-hook', 'review-star-rating', page)
        rating_text = clean_html(rating_html or '')
        average_rating = float_or_none(rating_text.partition(' ')[0])

        thumbnail = self._search_regex(
            r'data-thumbnail-url="([^"]+)"', body_html, 'thumbnail', default=None)

        return {
            'id': review_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'average_rating': average_rating,
            'thumbnail': thumbnail,
            'formats': formats,
            'subtitles': subtitles,
        }