yt_dlp/extractor/ynet.py

   1 import re
   2 import json
   3
   4 from .common import InfoExtractor
   5 from ..compat import compat_urllib_parse_unquote_plus
   6
   7
   8 class YnetIE(InfoExtractor):
   9     _VALID_URL = r'https?://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?P<id>L(?:-[0-9]+)+),00\.html'
  10     _TESTS = [
  11         {
  12             'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html',
  13             'info_dict': {
  14                 'id': 'L-11659-99244',
  15                 'ext': 'flv',
  16                 'title': 'איש לא יודע מאיפה באנו',
  17                 'thumbnail': r're:^https?://.*\.jpg',
  18             }
  19         }, {
  20             'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html',
  21             'info_dict': {
  22                 'id': 'L-8859-84418',
  23                 'ext': 'flv',
  24                 'title': "צפו: הנשיקה הלוהטת של תורגי' ויוליה פלוטקין",
  25                 'thumbnail': r're:^https?://.*\.jpg',
  26             }
  27         }
  28     ]
  29
  30     def _real_extract(self, url):
  31         video_id = self._match_id(url)
  32         webpage = self._download_webpage(url, video_id)
  33
  34         content = compat_urllib_parse_unquote_plus(self._og_search_video_url(webpage))
  35         config = json.loads(self._search_regex(r'config=({.+?})$', content, 'video config'))
  36         f4m_url = config['clip']['url']
  37         title = self._og_search_title(webpage)
  38         m = re.search(r'ynet - HOT -- (["\']+)(?P<title>.+?)\1', title)
  39         if m:
  40             title = m.group('title')
  41         formats = self._extract_f4m_formats(f4m_url, video_id)
  42         self._sort_formats(formats)
  43
  44         return {
  45             'id': video_id,
  46             'title': title,
  47             'formats': formats,
  48             'thumbnail': self._og_search_thumbnail(webpage),
  49         }