yt_dlp/extractor/amazon.py

   1 from .common import InfoExtractor
   2 from ..utils import int_or_none
   3
   4
   5 class AmazonStoreIE(InfoExtractor):
   6     _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/(?:[^/]+/)?(?:dp|gp/product)/(?P<id>[^/&#$?]+)'
   7
   8     _TESTS = [{
   9         'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/',
  10         'info_dict': {
  11             'id': 'B098XNCHLD',
  12             'title': 'md5:5f3194dbf75a8dcfc83079bd63a2abed',
  13         },
  14         'playlist_mincount': 1,
  15         'playlist': [{
  16             'info_dict': {
  17                 'id': 'A1F83G8C2ARO7P',
  18                 'ext': 'mp4',
  19                 'title': 'mcdodo usb c cable 100W 5a',
  20                 'thumbnail': r're:^https?://.*\.jpg$',
  21             },
  22         }]
  23     }, {
  24         'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3',
  25         'info_dict': {
  26             'id': 'B0863TXGM3',
  27             'title': 'md5:b0bde4881d3cfd40d63af19f7898b8ff',
  28         },
  29         'playlist_mincount': 4,
  30     }, {
  31         'url': 'https://www.amazon.com/dp/B0845NXCXF/',
  32         'info_dict': {
  33             'id': 'B0845NXCXF',
  34             'title': 'md5:2145cd4e3c7782f1ee73649a3cff1171',
  35         },
  36         'playlist-mincount': 1,
  37     }]
  38
  39     def _real_extract(self, url):
  40         id = self._match_id(url)
  41         webpage = self._download_webpage(url, id)
  42         data_json = self._parse_json(self._html_search_regex(r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'(.*)\'\)', webpage, 'data'), id)
  43         entries = [{
  44             'id': video['marketPlaceID'],
  45             'url': video['url'],
  46             'title': video.get('title'),
  47             'thumbnail': video.get('thumbUrl') or video.get('thumb'),
  48             'duration': video.get('durationSeconds'),
  49             'height': int_or_none(video.get('videoHeight')),
  50             'width': int_or_none(video.get('videoWidth')),
  51         } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')]
  52         return self.playlist_result(entries, playlist_id=id, playlist_title=data_json['title'])