[yt-dlp.git] / yt_dlp / extractor / joj.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
    int_or_none,
    js_to_json,
    try_get,
)


class JojIE(InfoExtractor):
    """Extractor for media.joj.sk embeds.

    Accepts either a full ``https://media.joj.sk/embed/<id>`` URL or the
    internal ``joj:<id>`` shorthand; both resolve to the same embed page.
    """

    _VALID_URL = r'''(?x)
                    (?:
                        joj:|
                        https?://media\.joj\.sk/embed/
                    )
                    (?P<id>[^/?#^]+)
                '''
    _TESTS = [{
        'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
        'info_dict': {
            'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932',
            'ext': 'mp4',
            'title': 'NOVÉ BÝVANIE',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 3118,
        }
    }, {
        'url': 'https://media.joj.sk/embed/9i1cxv',
        'only_matching': True,
    }, {
        'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',
        'only_matching': True,
    }, {
        'url': 'joj:9i1cxv',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the ``src`` of every media.joj.sk embed ``<iframe>`` in *webpage*.

        The ``src`` may be absolute (``http(s)://``) or protocol-relative
        (``//``); the value is returned exactly as it appears in the markup.
        """
        return [
            mobj.group('url')
            for mobj in re.finditer(
                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1',
                webpage)]

    def _real_extract(self, url):
        """Build the info dict (id, title, thumbnail, duration, formats) for *url*.

        Formats are read from the ``src``/``bitrates`` JavaScript object on the
        embed page; when that yields nothing, the ``Video.php`` XML service is
        queried instead.
        """
        video_id = self._match_id(url)

        # Always fetch the canonical embed page so that the ``joj:<id>`` form
        # is handled the same way as a full embed URL.
        webpage = self._download_webpage(
            'https://media.joj.sk/embed/%s' % video_id, video_id)

        # Title preference: player config ``videoTitle``, then <title>,
        # and finally the Open Graph title.
        title = self._search_regex(
            (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
             r'<title>(?P<title>[^<]+)'), webpage, 'title',
            default=None, group='title') or self._og_search_title(webpage)

        # The player config is a JavaScript object literal, hence js_to_json.
        # A missing object defaults to '{}' so the XML fallback below is used.
        bitrates = self._parse_json(
            self._search_regex(
                r'(?s)(?:src|bitrates)\s*=\s*({.+?});', webpage, 'bitrates',
                default='{}'),
            video_id, transform_source=js_to_json, fatal=False)

        formats = []
        for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []:
            if isinstance(format_url, compat_str):
                height = self._search_regex(
                    r'(\d+)[pP]\.', format_url, 'height', default=None)
                formats.append({
                    'url': format_url,
                    'format_id': '%sp' % height if height else None,
                    # height is None when the URL carries no "<digits>p."
                    # marker; int_or_none avoids int(None) raising TypeError
                    # and aborting extraction over an optional field.
                    'height': int_or_none(height),
                })
        if not formats:
            # Fallback: ask the XML service for the clip's file list.
            playlist = self._download_xml(
                'https://media.joj.sk/services/Video.php?clip=%s' % video_id,
                video_id)
            for file_el in playlist.findall('./files/file'):
                path = file_el.get('path')
                if not path:
                    continue
                format_id = file_el.get('id') or file_el.get('label')
                formats.append({
                    # Only the first 'dat/' occurrence is dropped from the path.
                    'url': 'http://n16.joj.sk/storage/%s' % path.replace(
                        'dat/', '', 1),
                    'format_id': format_id,
                    'height': int_or_none(self._search_regex(
                        r'(\d+)[pP]', format_id or path, 'height',
                        default=None)),
                })
        self._sort_formats(formats)

        thumbnail = self._og_search_thumbnail(webpage)

        # Duration is optional: a failed match must not abort extraction.
        duration = int_or_none(self._search_regex(
            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }
Commit	Line	Data
cefecac1 U	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
	4	import re
	5
	6	from .common import InfoExtractor
	7	from ..compat import compat_str
	8	from ..utils import (
	9	int_or_none,
	10	js_to_json,
	11	try_get,
	12	)
	13
	14
	15	class JojIE(InfoExtractor):
	16	_VALID_URL = r'''(?x)
	17	(?:
	18	joj:|
	19	https?://media\.joj\.sk/embed/
	20	)
	21	(?P<id>[^/?#^]+)
	22	'''
	23	_TESTS = [{
	24	'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
	25	'info_dict': {
	26	'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932',
	27	'ext': 'mp4',
	28	'title': 'NOVÉ BÝVANIE',
	29	'thumbnail': r're:^https?://.*\.jpg$',
	30	'duration': 3118,
	31	}
	32	}, {
	33	'url': 'https://media.joj.sk/embed/9i1cxv',
	34	'only_matching': True,
	35	}, {
	36	'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',
	37	'only_matching': True,
	38	}, {
	39	'url': 'joj:9i1cxv',
	40	'only_matching': True,
	41	}]
	42
	43	@staticmethod
	44	def _extract_urls(webpage):
	45	return [
	46	mobj.group('url')
	47	for mobj in re.finditer(
	48	r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1',
	49	webpage)]
	50
	51	def _real_extract(self, url):
	52	video_id = self._match_id(url)
	53
	54	webpage = self._download_webpage(
	55	'https://media.joj.sk/embed/%s' % video_id, video_id)
	56
	57	title = self._search_regex(
	58	(r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
	59	r'<title>(?P<title>[^<]+)'), webpage, 'title',
	60	default=None, group='title') or self._og_search_title(webpage)
	61
	62	bitrates = self._parse_json(
	63	self._search_regex(
	64	r'(?s)(?:src|bitrates)\s*=\s*({.+?});', webpage, 'bitrates',
65	default='{}'),
66	video_id, transform_source=js_to_json, fatal=False)
67
68	formats = []
69	for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []:
70	if isinstance(format_url, compat_str):
71	height = self._search_regex(
72	r'(\d+)[pP]\.', format_url, 'height', default=None)
73	formats.append({
74	'url': format_url,
75	'format_id': '%sp' % height if height else None,
76	'height': int(height),
77	})
78	if not formats:
79	playlist = self._download_xml(
80	'https://media.joj.sk/services/Video.php?clip=%s' % video_id,
81	video_id)
82	for file_el in playlist.findall('./files/file'):
83	path = file_el.get('path')
84	if not path:
85	continue
86	format_id = file_el.get('id') or file_el.get('label')
87	formats.append({
88	'url': 'http://n16.joj.sk/storage/%s' % path.replace(
89	'dat/', '', 1),
90	'format_id': format_id,
91	'height': int_or_none(self._search_regex(
92	r'(\d+)[pP]', format_id or path, 'height',
93	default=None)),
94	})
95	self._sort_formats(formats)
96
97	thumbnail = self._og_search_thumbnail(webpage)
98
99	duration = int_or_none(self._search_regex(
	100	r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
101
102	return {
103	'id': video_id,
104	'title': title,
105	'thumbnail': thumbnail,
106	'duration': duration,
107	'formats': formats,
108	}