[yt-dlp.git] / yt_dlp / extractor / amazon.py

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    clean_html,
    float_or_none,
    get_element_by_attribute,
    get_element_by_class,
    int_or_none,
    js_to_json,
    traverse_obj,
    url_or_none,
)


class AmazonStoreIE(InfoExtractor):
    """Extract the videos embedded in an Amazon product ("store") page.

    The page is searched for a ``var obj = jQuery.parseJSON('...')`` statement;
    the JSON it carries lists the product's videos, which are returned as a
    playlist whose ID is the product ID captured from the URL.
    """
    _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/(?:[^/]+/)?(?:dp|gp/product)/(?P<id>[^/&#$?]+)'

    _TESTS = [{
        'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/',
        'info_dict': {
            'id': 'B098XNCHLD',
            'title': str,
        },
        'playlist_mincount': 1,
        'playlist': [{
            'info_dict': {
                'id': 'A1F83G8C2ARO7P',
                'ext': 'mp4',
                'title': 'mcdodo usb c cable 100W 5a',
                'thumbnail': r're:^https?://.*\.jpg$',
                'duration': 34,
            },
        }],
        'expected_warnings': ['Unable to extract data'],
    }, {
        'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3',
        'info_dict': {
            'id': 'B0863TXGM3',
            'title': str,
        },
        'playlist_mincount': 4,
        'expected_warnings': ['Unable to extract data'],
    }, {
        'url': 'https://www.amazon.com/dp/B0845NXCXF/',
        'info_dict': {
            'id': 'B0845NXCXF',
            'title': str,
        },
        # Spelled with an underscore like the sibling tests; the previous
        # 'playlist-mincount' key did not match the name used everywhere else.
        'playlist_mincount': 1,
        'expected_warnings': ['Unable to extract data'],
    }, {
        'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ',
        'info_dict': {
            'id': 'B08WX337PQ',
            'title': str,
        },
        'playlist_mincount': 1,
        'expected_warnings': ['Unable to extract data'],
    }]

    def _real_extract(self, url):
        """Return a playlist of the videos listed in the product page's JSON.

        Only entries flagged ``isVideo`` that also carry a ``url`` are kept.
        An entry that passes this filter but lacks ``marketPlaceID`` raises
        ``KeyError``, since that key is read without a fallback.
        """
        # Named playlist_id rather than `id` to avoid shadowing the builtin.
        playlist_id = self._match_id(url)

        # Each attempt downloads the page again. A failed JSON search is
        # handed to the retry manager through `retry.error` instead of being
        # raised straight away.
        for retry in self.RetryManager():
            webpage = self._download_webpage(url, playlist_id)
            try:
                data_json = self._search_json(
                    r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', playlist_id,
                    transform_source=js_to_json)
            except ExtractorError as e:
                retry.error = e

        entries = [{
            'id': video['marketPlaceID'],
            'url': video['url'],
            'title': video.get('title'),
            # Prefer 'thumbUrl' and fall back to 'thumb' when it is missing/empty.
            'thumbnail': video.get('thumbUrl') or video.get('thumb'),
            'duration': video.get('durationSeconds'),
            'height': int_or_none(video.get('videoHeight')),
            'width': int_or_none(video.get('videoWidth')),
        } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')]
        return self.playlist_result(entries, playlist_id=playlist_id, playlist_title=data_json.get('title'))


class AmazonReviewsIE(InfoExtractor):
    """Extract the video attached to an Amazon customer review page."""
    _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/gp/customer-reviews/(?P<id>[^/&#$?]+)'
    _TESTS = [{
        'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl',
        'info_dict': {
            'id': 'R10VE9VUSY19L3',
            'ext': 'mp4',
            'title': 'Get squad #Suspicious',
            'description': 'md5:7012695052f440a1e064e402d87e0afb',
            'uploader': 'Kimberly Cronkright',
            'average_rating': 1.0,
            'thumbnail': r're:^https?://.*\.jpg$',
        },
        'expected_warnings': ['Review body was not found in webpage'],
    }, {
        'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl?language=es_US',
        'info_dict': {
            'id': 'R10VE9VUSY19L3',
            'ext': 'mp4',
            'title': 'Get squad #Suspicious',
            'description': 'md5:7012695052f440a1e064e402d87e0afb',
            'uploader': 'Kimberly Cronkright',
            'average_rating': 1.0,
            'thumbnail': r're:^https?://.*\.jpg$',
        },
        'expected_warnings': ['Review body was not found in webpage'],
    }, {
        'url': 'https://www.amazon.in/gp/customer-reviews/RV1CO8JN5VGXV/',
        'info_dict': {
            'id': 'RV1CO8JN5VGXV',
            'ext': 'mp4',
            'title': 'Not sure about its durability',
            'description': 'md5:1a252c106357f0a3109ebf37d2e87494',
            'uploader': 'Shoaib Gulzar',
            'average_rating': 2.0,
            'thumbnail': r're:^https?://.*\.jpg$',
        },
        'expected_warnings': ['Review body was not found in webpage'],
    }]

    def _real_extract(self, url):
        """Return the info dict for the video embedded in a review page.

        Formats come from two places inside the review body: the manifest in
        the ``data-video-url`` attribute and the value of the ``video-url``
        ``<input>``. Metadata is read from the review body and the full page.
        """
        video_id = self._match_id(url)

        # Each attempt downloads the page again; an attempt whose page has no
        # review body is reported to the retry manager as an expected error.
        for attempt in self.RetryManager():
            webpage = self._download_webpage(url, video_id)
            review_body = get_element_by_attribute('data-hook', 'review-body', webpage)
            if not review_body:
                attempt.error = ExtractorError('Review body was not found in webpage', expected=True)

        formats, subtitles = [], {}

        # Manifest advertised on the review body (extracted as m3u8, non-fatal).
        hls_url = self._search_regex(
            r'data-video-url="([^"]+)"', review_body, 'm3u8 url', default=None)
        if url_or_none(hls_url):
            hls_formats, subtitles = self._extract_m3u8_formats_and_subtitles(
                hls_url, video_id, 'mp4', fatal=False)
            formats.extend(hls_formats)

        # Direct file URL stored in the value of the "video-url" input.
        direct_url = self._search_regex(
            r'<input[^>]+\bvalue="([^"]+)"[^>]+\bclass="video-url"', review_body, 'mp4 url', default=None)
        if url_or_none(direct_url):
            formats.append({
                'url': direct_url,
                'ext': 'mp4',
                'format_id': 'http-mp4',
            })

        if not formats:
            self.raise_no_formats('No video found for this customer review', expected=True)

        # Review title, falling back to self._html_extract_title(webpage).
        title = clean_html(get_element_by_attribute('data-hook', 'review-title', webpage))
        if not title:
            title = self._html_extract_title(webpage)

        # The last matching <span> in the review body is used as description.
        span_contents = re.findall(
            r'<span(?:\s+class="cr-original-review-content")?>(.+?)</span>', review_body)
        description = clean_html(traverse_obj(span_contents, -1))

        uploader = clean_html(get_element_by_class('a-profile-name', webpage))

        # Only the text before the first space of the rating element is parsed.
        rating_html = get_element_by_attribute('data-hook', 'review-star-rating', webpage) or ''
        rating_value, _, _ = clean_html(rating_html).partition(' ')
        average_rating = float_or_none(rating_value)

        thumbnail = self._search_regex(
            r'data-thumbnail-url="([^"]+)"', review_body, 'thumbnail', default=None)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'average_rating': average_rating,
            'thumbnail': thumbnail,
            'formats': formats,
            'subtitles': subtitles,
        }
Commit	Line	Data
53006b35	1	import re
53006b35	2
3c4eebf7	3	from .common import InfoExtractor
53006b35	4	from ..utils import (
	5	ExtractorError,
	6	clean_html,
	7	float_or_none,
	8	get_element_by_attribute,
	9	get_element_by_class,
	10	int_or_none,
	11	js_to_json,
	12	traverse_obj,
	13	url_or_none,
	14	)
3c4eebf7	15
	16
	17	class AmazonStoreIE(InfoExtractor):
73f035e1	18	_VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/(?:[^/]+/)?(?:dp\|gp/product)/(?P<id>[^/&#$?]+)'
3c4eebf7	19
	20	_TESTS = [{
	21	'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/',
	22	'info_dict': {
	23	'id': 'B098XNCHLD',
53006b35	24	'title': str,
3c4eebf7	25	},
	26	'playlist_mincount': 1,
	27	'playlist': [{
	28	'info_dict': {
	29	'id': 'A1F83G8C2ARO7P',
	30	'ext': 'mp4',
	31	'title': 'mcdodo usb c cable 100W 5a',
	32	'thumbnail': r're:^https?://.*\.jpg$',
7474e453	33	'duration': 34,
3c4eebf7	34	},
53006b35	35	}],
53006b35	36	'expected_warnings': ['Unable to extract data'],
3c4eebf7	37	}, {
	38	'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3',
	39	'info_dict': {
	40	'id': 'B0863TXGM3',
53006b35	41	'title': str,
3c4eebf7	42	},
3c4eebf7	43	'playlist_mincount': 4,
53006b35	44	'expected_warnings': ['Unable to extract data'],
3c4eebf7	45	}, {
	46	'url': 'https://www.amazon.com/dp/B0845NXCXF/',
	47	'info_dict': {
	48	'id': 'B0845NXCXF',
53006b35	49	'title': str,
3c4eebf7	50	},
3c4eebf7	51	'playlist-mincount': 1,
53006b35	52	'expected_warnings': ['Unable to extract data'],
7474e453	53	}, {
	54	'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ',
	55	'info_dict': {
	56	'id': 'B08WX337PQ',
53006b35	57	'title': str,
7474e453	58	},
7474e453	59	'playlist_mincount': 1,
53006b35	60	'expected_warnings': ['Unable to extract data'],
3c4eebf7	61	}]
	62
	63	def _real_extract(self, url):
	64	id = self._match_id(url)
3c7a2762	65
8ca48a1a	66	for retry in self.RetryManager():
3c7a2762 L	67	webpage = self._download_webpage(url, id)
3c7a2762 L	68	try:
7474e453	69	data_json = self._search_json(
7474e453	70	r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', id,
53006b35	71	transform_source=js_to_json)
3c7a2762 L	72	except ExtractorError as e:
	73	retry.error = e
	74
3c4eebf7	75	entries = [{
	76	'id': video['marketPlaceID'],
	77	'url': video['url'],
	78	'title': video.get('title'),
	79	'thumbnail': video.get('thumbUrl') or video.get('thumb'),
	80	'duration': video.get('durationSeconds'),
	81	'height': int_or_none(video.get('videoHeight')),
	82	'width': int_or_none(video.get('videoWidth')),
	83	} for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')]
7474e453	84	return self.playlist_result(entries, playlist_id=id, playlist_title=data_json.get('title'))
53006b35	85
	86
	87	class AmazonReviewsIE(InfoExtractor):
	88	_VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/gp/customer-reviews/(?P<id>[^/&#$?]+)'
	89	_TESTS = [{
	90	'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl',
	91	'info_dict': {
	92	'id': 'R10VE9VUSY19L3',
	93	'ext': 'mp4',
	94	'title': 'Get squad #Suspicious',
	95	'description': 'md5:7012695052f440a1e064e402d87e0afb',
	96	'uploader': 'Kimberly Cronkright',
	97	'average_rating': 1.0,
	98	'thumbnail': r're:^https?://.*\.jpg$',
	99	},
	100	'expected_warnings': ['Review body was not found in webpage'],
	101	}, {
	102	'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl?language=es_US',
	103	'info_dict': {
	104	'id': 'R10VE9VUSY19L3',
	105	'ext': 'mp4',
	106	'title': 'Get squad #Suspicious',
	107	'description': 'md5:7012695052f440a1e064e402d87e0afb',
	108	'uploader': 'Kimberly Cronkright',
	109	'average_rating': 1.0,
	110	'thumbnail': r're:^https?://.*\.jpg$',
	111	},
	112	'expected_warnings': ['Review body was not found in webpage'],
	113	}, {
	114	'url': 'https://www.amazon.in/gp/customer-reviews/RV1CO8JN5VGXV/',
	115	'info_dict': {
	116	'id': 'RV1CO8JN5VGXV',
	117	'ext': 'mp4',
	118	'title': 'Not sure about its durability',
	119	'description': 'md5:1a252c106357f0a3109ebf37d2e87494',
	120	'uploader': 'Shoaib Gulzar',
	121	'average_rating': 2.0,
	122	'thumbnail': r're:^https?://.*\.jpg$',
	123	},
	124	'expected_warnings': ['Review body was not found in webpage'],
	125	}]
	126
	127	def _real_extract(self, url):
	128	video_id = self._match_id(url)
	129
	130	for retry in self.RetryManager():
	131	webpage = self._download_webpage(url, video_id)
	132	review_body = get_element_by_attribute('data-hook', 'review-body', webpage)
	133	if not review_body:
	134	retry.error = ExtractorError('Review body was not found in webpage', expected=True)
	135
	136	formats, subtitles = [], {}
	137
	138	manifest_url = self._search_regex(
	139	r'data-video-url="([^"]+)"', review_body, 'm3u8 url', default=None)
	140	if url_or_none(manifest_url):
	141	fmts, subtitles = self._extract_m3u8_formats_and_subtitles(
	142	manifest_url, video_id, 'mp4', fatal=False)
	143	formats.extend(fmts)
	144
	145	video_url = self._search_regex(
	146	r'<input[^>]+\bvalue="([^"]+)"[^>]+\bclass="video-url"', review_body, 'mp4 url', default=None)
	147	if url_or_none(video_url):
	148	formats.append({
149	'url': video_url,
150	'ext': 'mp4',
151	'format_id': 'http-mp4',
152	})
153
154	if not formats:
155	self.raise_no_formats('No video found for this customer review', expected=True)
156
157	return {
158	'id': video_id,
159	'title': (clean_html(get_element_by_attribute('data-hook', 'review-title', webpage))
160	or self._html_extract_title(webpage)),
161	'description': clean_html(traverse_obj(re.findall(
162	r'<span(?:\s+class="cr-original-review-content")?>(.+?)</span>', review_body), -1)),
163	'uploader': clean_html(get_element_by_class('a-profile-name', webpage)),
164	'average_rating': float_or_none(clean_html(get_element_by_attribute(
165	'data-hook', 'review-star-rating', webpage) or '').partition(' ')[0]),
166	'thumbnail': self._search_regex(
167	r'data-thumbnail-url="([^"]+)"', review_body, 'thumbnail', default=None),
168	'formats': formats,
169	'subtitles': subtitles,
170	}