[yt-dlp.git] / youtube_dl / extractor / lrt.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    int_or_none,
    parse_duration,
    remove_end,
)


class LRTIE(InfoExtractor):
    # Extractor for lrt.lt "mediateka" recording pages; the numeric path
    # component matched by _VALID_URL is used as the video id.
    IE_NAME = 'lrt.lt'
    _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)'
    _TESTS = [{
        # m3u8 download
        'url': 'http://www.lrt.lt/mediateka/irasas/54391/',
        'md5': 'fe44cf7e4ab3198055f2c598fc175cb0',
        'info_dict': {
            'id': '54391',
            'ext': 'mp4',
            'title': 'Septynios Kauno dienos',
            'description': 'md5:24d84534c7dc76581e59f5689462411a',
            'duration': 1783,
            'view_count': int,
            'like_count': int,
        },
    }, {
        # direct mp3 download
        'url': 'http://www.lrt.lt/mediateka/irasas/1013074524/',
        'md5': '389da8ca3cad0f51d12bed0c844f6a0a',
        'info_dict': {
            'id': '1013074524',
            'ext': 'mp3',
            'title': 'Kita tema 2016-09-05 15:05',
            'description': 'md5:1b295a8fc7219ed0d543fc228c931fb5',
            'duration': 3008,
            'view_count': int,
            'like_count': int,
        },
    }]

    def _real_extract(self, url):
        """Download the recording page for ``url`` and return its info dict.

        Formats are collected from every quoted ``file: "..."`` entry in the
        page whose extension is ``m3u8`` or ``mp3``; the remaining metadata
        comes from Open Graph tags and a few page-specific regexes.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The Open Graph title carries a trailing site suffix; strip it.
        og_title = self._og_search_title(webpage)
        title = remove_end(og_title, ' - LRT')

        formats = []
        file_entries = re.finditer(
            r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage)
        for entry in file_entries:
            media_url = entry.group('url')
            ext = determine_ext(media_url)
            if ext == 'mp3':
                # Direct audio file: no video stream.
                formats.append({
                    'url': media_url,
                    'vcodec': 'none',
                })
            elif ext == 'm3u8' and '.mp3' not in media_url:
                # mp3 served as m3u8 produces stuttered media file, so only
                # playlists that do not reference an .mp3 are expanded.
                hls_formats = self._extract_m3u8_formats(
                    media_url, video_id, 'mp4',
                    entry_protocol='m3u8_native', fatal=False)
                formats.extend(hls_formats)
        self._sort_formats(formats)

        info = {
            'id': video_id,
            'title': title,
            'formats': formats,
        }

        info['thumbnail'] = self._og_search_thumbnail(webpage)
        info['description'] = self._og_search_description(webpage)

        # Duration is embedded in a JS variable as a quoted H:M:S string;
        # a missing variable silently yields None (default=None).
        raw_duration = self._search_regex(
            r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1',
            webpage, 'duration', default=None, group='duration')
        info['duration'] = parse_duration(raw_duration)

        # Both counters are optional (fatal=False).
        raw_views = self._html_search_regex(
            r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>',
            webpage, 'view count', fatal=False, group='count')
        info['view_count'] = int_or_none(raw_views)

        raw_likes = self._search_regex(
            r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<',
            webpage, 'like count', fatal=False, group='count')
        info['like_count'] = int_or_none(raw_likes)

        return info
Commit	Line	Data
4dc19c09 NJ	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
2512b174 S	4	import re
2512b174 S	5
4dc19c09 NJ	6	from .common import InfoExtractor
4dc19c09 NJ	7	from ..utils import (
2512b174	8	determine_ext,
15aad84d	9	int_or_none,
4dc19c09 NJ	10	parse_duration,
	11	remove_end,
	12	)
	13
	14
	15	class LRTIE(InfoExtractor):
	16	IE_NAME = 'lrt.lt'
	17	_VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)'
2512b174 S	18	_TESTS = [{
2512b174 S	19	# m3u8 download
4dc19c09	20	'url': 'http://www.lrt.lt/mediateka/irasas/54391/',
2512b174	21	'md5': 'fe44cf7e4ab3198055f2c598fc175cb0',
4dc19c09 NJ	22	'info_dict': {
	23	'id': '54391',
	24	'ext': 'mp4',
	25	'title': 'Septynios Kauno dienos',
9a76f416	26	'description': 'md5:24d84534c7dc76581e59f5689462411a',
4dc19c09	27	'duration': 1783,
15aad84d S	28	'view_count': int,
15aad84d S	29	'like_count': int,
4dc19c09	30	},
2512b174 S	31	}, {
	32	# direct mp3 download
	33	'url': 'http://www.lrt.lt/mediateka/irasas/1013074524/',
	34	'md5': '389da8ca3cad0f51d12bed0c844f6a0a',
	35	'info_dict': {
	36	'id': '1013074524',
	37	'ext': 'mp3',
	38	'title': 'Kita tema 2016-09-05 15:05',
	39	'description': 'md5:1b295a8fc7219ed0d543fc228c931fb5',
	40	'duration': 3008,
	41	'view_count': int,
	42	'like_count': int,
4dc19c09	43	},
2512b174	44	}]
4dc19c09 NJ	45
4dc19c09 NJ	46	def _real_extract(self, url):
8112d4b2	47	video_id = self._match_id(url)
4dc19c09 NJ	48	webpage = self._download_webpage(url, video_id)
	49
	50	title = remove_end(self._og_search_title(webpage), ' - LRT')
2512b174 S	51
	52	formats = []
	53	for _, file_url in re.findall(
	54	r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage):
	55	ext = determine_ext(file_url)
	56	if ext not in ('m3u8', 'mp3'):
	57	continue
	58	# mp3 served as m3u8 produces stuttered media file
	59	if ext == 'm3u8' and '.mp3' in file_url:
	60	continue
	61	if ext == 'm3u8':
	62	formats.extend(self._extract_m3u8_formats(
	63	file_url, video_id, 'mp4', entry_protocol='m3u8_native',
	64	fatal=False))
	65	elif ext == 'mp3':
	66	formats.append({
	67	'url': file_url,
	68	'vcodec': 'none',
	69	})
19dbaeec	70	self._sort_formats(formats)
f7e1d82d	71
4dc19c09 NJ	72	thumbnail = self._og_search_thumbnail(webpage)
	73	description = self._og_search_description(webpage)
	74	duration = parse_duration(self._search_regex(
f7e1d82d S	75	r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1',
f7e1d82d S	76	webpage, 'duration', default=None, group='duration'))
4dc19c09	77
15aad84d S	78	view_count = int_or_none(self._html_search_regex(
	79	r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>',
	80	webpage, 'view count', fatal=False, group='count'))
	81	like_count = int_or_none(self._search_regex(
	82	r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<',
	83	webpage, 'like count', fatal=False, group='count'))
	84
4dc19c09 NJ	85	return {
	86	'id': video_id,
	87	'title': title,
	88	'formats': formats,
	89	'thumbnail': thumbnail,
	90	'description': description,
	91	'duration': duration,
15aad84d S	92	'view_count': view_count,
15aad84d S	93	'like_count': like_count,
4dc19c09	94	}