]> jfr.im git - yt-dlp.git/blame - yt_dlp/extractor/amazon.py
[cleanup] Add more ruff rules (#10149)
[yt-dlp.git] / yt_dlp / extractor / amazon.py
CommitLineData
53006b35 1import re
2
3c4eebf7 3from .common import InfoExtractor
53006b35 4from ..utils import (
5 ExtractorError,
6 clean_html,
7 float_or_none,
8 get_element_by_attribute,
9 get_element_by_class,
10 int_or_none,
11 js_to_json,
12 traverse_obj,
13 url_or_none,
14)
3c4eebf7 15
16
class AmazonStoreIE(InfoExtractor):
    """Extractor for the videos embedded in an Amazon product page.

    The product ID captured from the URL is used as the playlist ID; each
    usable item of the page's embedded ``videos`` JSON list becomes an entry.
    """

    _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/(?:[^/]+/)?(?:dp|gp/product)/(?P<id>[^/&#$?]+)'

    _TESTS = [{
        'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/',
        'info_dict': {
            'id': 'B098XNCHLD',
            'title': str,
        },
        'playlist_mincount': 1,
        'playlist': [{
            'info_dict': {
                'id': 'A1F83G8C2ARO7P',
                'ext': 'mp4',
                'title': 'mcdodo usb c cable 100W 5a',
                'thumbnail': r're:^https?://.*\.jpg$',
                'duration': 34,
            },
        }],
        'expected_warnings': ['Unable to extract data'],
    }, {
        'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3',
        'info_dict': {
            'id': 'B0863TXGM3',
            'title': str,
        },
        'playlist_mincount': 4,
        'expected_warnings': ['Unable to extract data'],
    }, {
        'url': 'https://www.amazon.com/dp/B0845NXCXF/',
        'info_dict': {
            'id': 'B0845NXCXF',
            'title': str,
        },
        'playlist_mincount': 1,
        'expected_warnings': ['Unable to extract data'],
    }, {
        'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ',
        'info_dict': {
            'id': 'B08WX337PQ',
            'title': str,
        },
        'playlist_mincount': 1,
        'expected_warnings': ['Unable to extract data'],
    }]

    def _real_extract(self, url):
        """Return a playlist result holding the videos found on the product page.

        The page is downloaded inside a retry loop; a failure to locate the
        embedded JSON is handed to the retry manager instead of being raised
        directly.
        """
        playlist_id = self._match_id(url)

        for retry in self.RetryManager():
            webpage = self._download_webpage(url, playlist_id)
            try:
                # The video metadata is passed as a JS string literal to
                # jQuery.parseJSON(), hence the js_to_json transform.
                data_json = self._search_json(
                    r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', playlist_id,
                    transform_source=js_to_json)
            except ExtractorError as e:
                retry.error = e

        # NOTE(review): this relies on the retry manager raising once the
        # attempts are exhausted, so that data_json is always bound here - confirm.
        entries = [{
            'id': video['marketPlaceID'],
            'url': video['url'],
            'title': video.get('title'),
            'thumbnail': video.get('thumbUrl') or video.get('thumb'),
            'duration': video.get('durationSeconds'),
            'height': int_or_none(video.get('videoHeight')),
            'width': int_or_none(video.get('videoWidth')),
            # Only items flagged as videos that actually carry a URL are kept
        } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')]
        return self.playlist_result(entries, playlist_id=playlist_id, playlist_title=data_json.get('title'))
53006b35 85
86
class AmazonReviewsIE(InfoExtractor):
    """Extractor for the video attached to a single Amazon customer review."""

    _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/gp/customer-reviews/(?P<id>[^/&#$?]+)'
    _TESTS = [{
        'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl',
        'info_dict': {
            'id': 'R10VE9VUSY19L3',
            'ext': 'mp4',
            'title': 'Get squad #Suspicious',
            'description': 'md5:7012695052f440a1e064e402d87e0afb',
            'uploader': 'Kimberly Cronkright',
            'average_rating': 1.0,
            'thumbnail': r're:^https?://.*\.jpg$',
        },
        'expected_warnings': ['Review body was not found in webpage'],
    }, {
        'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl?language=es_US',
        'info_dict': {
            'id': 'R10VE9VUSY19L3',
            'ext': 'mp4',
            'title': 'Get squad #Suspicious',
            'description': 'md5:7012695052f440a1e064e402d87e0afb',
            'uploader': 'Kimberly Cronkright',
            'average_rating': 1.0,
            'thumbnail': r're:^https?://.*\.jpg$',
        },
        'expected_warnings': ['Review body was not found in webpage'],
    }, {
        'url': 'https://www.amazon.in/gp/customer-reviews/RV1CO8JN5VGXV/',
        'info_dict': {
            'id': 'RV1CO8JN5VGXV',
            'ext': 'mp4',
            'title': 'Not sure about its durability',
            'description': 'md5:1a252c106357f0a3109ebf37d2e87494',
            'uploader': 'Shoaib Gulzar',
            'average_rating': 2.0,
            'thumbnail': r're:^https?://.*\.jpg$',
        },
        'expected_warnings': ['Review body was not found in webpage'],
    }]

    def _real_extract(self, url):
        """Build the info dict for the review video found at ``url``.

        Formats come from two places inside the review body: a manifest URL
        in a ``data-video-url`` attribute and a direct URL in the
        ``video-url`` input element.
        """
        video_id = self._match_id(url)

        # Re-download the page until the review body element shows up
        for retry in self.RetryManager():
            webpage = self._download_webpage(url, video_id)
            review_body = get_element_by_attribute('data-hook', 'review-body', webpage)
            if not review_body:
                retry.error = ExtractorError('Review body was not found in webpage', expected=True)

        formats, subtitles = [], {}

        # Manifest variant
        hls_url = self._search_regex(
            r'data-video-url="([^"]+)"', review_body, 'm3u8 url', default=None)
        if url_or_none(hls_url):
            hls_formats, subtitles = self._extract_m3u8_formats_and_subtitles(
                hls_url, video_id, 'mp4', fatal=False)
            formats.extend(hls_formats)

        # Direct file variant
        direct_url = self._search_regex(
            r'<input[^>]+\bvalue="([^"]+)"[^>]+\bclass="video-url"', review_body, 'mp4 url', default=None)
        if url_or_none(direct_url):
            formats.append({
                'url': direct_url,
                'ext': 'mp4',
                'format_id': 'http-mp4',
            })

        if not formats:
            self.raise_no_formats('No video found for this customer review', expected=True)

        # Prefer the review headline, falling back to the page title
        title = (
            clean_html(get_element_by_attribute('data-hook', 'review-title', webpage))
            or self._html_extract_title(webpage))

        # The description is taken from the last matching <span> of the review body
        span_texts = re.findall(
            r'<span(?:\s+class="cr-original-review-content")?>(.+?)</span>', review_body)
        description = clean_html(traverse_obj(span_texts, -1))

        # Only the text before the first space of the star-rating element is parsed
        rating_html = get_element_by_attribute('data-hook', 'review-star-rating', webpage) or ''
        rating_text = clean_html(rating_html)
        average_rating = float_or_none(rating_text.partition(' ')[0])

        thumbnail = self._search_regex(
            r'data-thumbnail-url="([^"]+)"', review_body, 'thumbnail', default=None)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': clean_html(get_element_by_class('a-profile-name', webpage)),
            'average_rating': average_rating,
            'thumbnail': thumbnail,
            'formats': formats,
            'subtitles': subtitles,
        }