[yt-dlp.git] / yt_dlp / downloader / mhtml.py

import io
import quopri
import re
import uuid

from .fragment import FragmentFD
from ..utils import escapeHTML, formatSeconds, srt_subtitles_timecode, urljoin
from ..version import __version__ as YT_DLP_VERSION


class MhtmlFD(FragmentFD):
    """Fragment downloader that bundles downloaded fragments into one MHTML file.

    The output is a ``multipart/related`` MIME document: a generated HTML
    page (see ``_gen_stub``) followed by one MIME part per fragment, which
    the page references through ``cid:`` URLs.
    """
    FD_NAME = 'mhtml'

    # CSS embedded into the <style> element of the generated HTML page.
    # Every fragment is rendered as a <figure> holding a caption and an <img>.
    _STYLESHEET = """\
html, body {
    margin: 0;
    padding: 0;
    height: 100vh;
}

html {
    overflow-y: scroll;
    scroll-snap-type: y mandatory;
}

body {
    scroll-snap-type: y mandatory;
    display: flex;
    flex-flow: column;
}

body > figure {
    max-width: 100vw;
    max-height: 100vh;
    scroll-snap-align: center;
}

body > figure > figcaption {
    text-align: center;
    height: 2.5em;
}

body > figure > img {
    display: block;
    margin: auto;
    max-width: 100%;
    max-height: calc(100vh - 5em);
}
"""
    # Minify the stylesheet once, at class-definition time.
    # Step 1: collapse every run of whitespace (newlines included) into a single space.
    _STYLESHEET = re.sub(r'\s+', ' ', _STYLESHEET)
    # Step 2: drop the spaces that do not separate two tokens. A space is removed
    # when both neighbours are non-word characters, or when only one of its
    # neighbours is a word character or '-'. Spaces such as the ones in
    # `y mandatory` and `100vh - 5em` therefore survive.
    _STYLESHEET = re.sub(r'\B \B|(?<=[\w\-]) (?=[^\w\-])|(?<=[^\w\-]) (?=[\w\-])', '', _STYLESHEET)

    @staticmethod
    def _escape_mime(s):
        return '=?utf-8?Q?' + (b''.join(
            bytes((b,)) if b >= 0x20 else b'=%02X' % b
            for b in quopri.encodestring(s.encode(), header=True)
        )).decode('us-ascii') + '?='

    def _gen_cid(self, i, fragment, frag_boundary):
        return '%u.%s@yt-dlp.github.io.invalid' % (i, frag_boundary)

    def _gen_stub(self, *, fragments, frag_boundary, title):
        """Render the HTML page that displays every fragment as a slide.

        Each fragment becomes a ``<figure>`` with a caption and an ``<img>``
        pointing at the fragment's Content-ID. While fragment durations are
        usable, the caption also shows the slide's start/end time; once a
        duration is missing or unusable, that slide and every later one get
        a caption without timing, as the running start time is then unknown.

        @param fragments      Iterable of fragment dicts (``duration`` is optional)
        @param frag_boundary  MIME boundary string, used to derive Content-IDs
        @param title          Document title; HTML-escaped before insertion
        @returns              The HTML document as str
        """
        page_head = (
            '<!DOCTYPE html>'
            '<html>'
            '<head>'
            '<meta name="generator" content="yt-dlp {version}">'
            '<title>{title}</title>'
            '<style>{styles}</style>'
            '<body>'
        )
        pieces = [page_head.format(
            version=escapeHTML(YT_DLP_VERSION),
            styles=self._STYLESHEET,
            title=escapeHTML(title),
        )]

        # Start time of the current slide; becomes None once it can no longer be computed
        slide_start = 0
        for index, frag in enumerate(fragments):
            slide_no = index + 1
            try:
                slide_end = slide_start + frag['duration']
                caption = '<figcaption>Slide #{num}: {t0} – {t1} (duration: {duration})</figcaption>'.format(
                    num=slide_no,
                    t0=srt_subtitles_timecode(slide_start),
                    t1=srt_subtitles_timecode(slide_end),
                    duration=formatSeconds(frag['duration'], msec=True),
                )
            except (KeyError, ValueError, TypeError):
                # No usable timing information: fall back to a plain caption
                slide_end = None
                caption = '<figcaption>Slide #{num}</figcaption>'.format(num=slide_no)

            image = '<img src="cid:{cid}">'.format(
                cid=self._gen_cid(index, frag, frag_boundary))
            pieces.extend(('<figure>', caption, image, '</figure>'))
            slide_start = slide_end

        return ''.join(pieces)

    def real_download(self, filename, info_dict):
        """Download all fragments and write them out as one MHTML document.

        The document starts with the MIME headers and the HTML page from
        ``_gen_stub``; every successfully downloaded fragment is appended as
        its own MIME part, and a closing boundary ends the document.

        @param filename   Name of the output file; stored in the download context
        @param info_dict  Format dictionary. Reads ``fragments`` (required),
                          ``fragment_base_url``, ``title`` (falls back to
                          ``format_id``) and ``webpage_url`` (falls back to
                          ``url``)
        @returns          Always True
        """
        fragment_base_url = info_dict.get('fragment_base_url')
        # In test mode only the first fragment is processed
        fragments = info_dict['fragments'][:1] if self.params.get(
            'test', False) else info_dict['fragments']

        # Resolve the fallbacks lazily: passing them as `dict.get` defaults would
        # evaluate them eagerly (raising KeyError even when the preferred key is
        # present) and would let an explicit None through.
        title = info_dict.get('title')
        if title is None:
            title = info_dict['format_id']
        origin = info_dict.get('webpage_url')
        if origin is None:
            origin = info_dict['url']

        ctx = {
            'filename': filename,
            'total_frags': len(fragments),
        }

        # Presumably this fills in ctx['dest_stream'] and ctx['fragment_index'],
        # and restores ctx['extra_state'] when resuming - confirm in FragmentFD
        self._prepare_and_start_frag_download(ctx, info_dict)

        # Reuse existing state if present so that the MIME boundary stays the same
        extra_state = ctx.setdefault('extra_state', {
            'header_written': False,
            'mime_boundary': str(uuid.uuid4()).replace('-', ''),
        })

        frag_boundary = extra_state['mime_boundary']

        if not extra_state['header_written']:
            stub = self._gen_stub(
                fragments=fragments,
                frag_boundary=frag_boundary,
                title=title
            )

            ctx['dest_stream'].write((
                'MIME-Version: 1.0\r\n'
                'From: <nowhere@yt-dlp.github.io.invalid>\r\n'
                'To: <nowhere@yt-dlp.github.io.invalid>\r\n'
                'Subject: {title}\r\n'
                'Content-type: multipart/related; '
                ''  'boundary="{boundary}"; '
                ''  'type="text/html"\r\n'
                'X.yt-dlp.Origin: {origin}\r\n'
                '\r\n'
                '--{boundary}\r\n'
                'Content-Type: text/html; charset=utf-8\r\n'
                'Content-Length: {length}\r\n'
                '\r\n'
                '{stub}\r\n'
            ).format(
                origin=origin,
                boundary=frag_boundary,
                # The stub is written UTF-8 encoded and may contain non-ASCII
                # characters, so the length has to be counted in bytes
                length=len(stub.encode()),
                title=self._escape_mime(title),
                stub=stub
            ).encode())
            extra_state['header_written'] = True

        for i, fragment in enumerate(fragments):
            # Skip the fragments that ctx reports as already processed
            if (i + 1) <= ctx['fragment_index']:
                continue

            fragment_url = fragment.get('url')
            if not fragment_url:
                assert fragment_base_url
                fragment_url = urljoin(fragment_base_url, fragment['path'])

            success = self._download_fragment(ctx, fragment_url, info_dict)
            if not success:
                continue
            frag_content = self._read_fragment(ctx)

            # Sniff the image type from its magic bytes; JPEG is the default
            mime_type = b'image/jpeg'
            if frag_content.startswith(b'\x89PNG\r\n\x1a\n'):
                mime_type = b'image/png'
            if frag_content.startswith((b'GIF87a', b'GIF89a')):
                mime_type = b'image/gif'
            if frag_content.startswith(b'RIFF') and frag_content[8:12] == b'WEBP':
                mime_type = b'image/webp'

            # The duration is optional (`_gen_stub` tolerates its absence as
            # well), so the header is left out when it is missing or not a number
            try:
                duration_header = b'X.yt-dlp.Duration: %f\r\n' % fragment['duration']
            except (KeyError, TypeError):
                duration_header = b''

            frag_header = io.BytesIO()
            frag_header.write(
                b'--%b\r\n' % frag_boundary.encode('us-ascii'))
            frag_header.write(
                b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii'))
            frag_header.write(
                b'Content-type: %b\r\n' % mime_type)
            frag_header.write(
                b'Content-length: %u\r\n' % len(frag_content))
            frag_header.write(
                b'Content-location: %b\r\n' % fragment_url.encode('us-ascii'))
            frag_header.write(duration_header)
            frag_header.write(b'\r\n')
            self._append_fragment(
                ctx, frag_header.getvalue() + frag_content + b'\r\n')

        # Closing boundary marks the end of the multipart document
        ctx['dest_stream'].write(
            b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii'))
        self._finish_frag_download(ctx, info_dict)
        return True
Commit	Line	Data
cdb19aa4	1	import io
	2	import quopri
	3	import re
	4	import uuid
	5
	6	from .fragment import FragmentFD
f8271158	7	from ..utils import escapeHTML, formatSeconds, srt_subtitles_timecode, urljoin
cdb19aa4	8	from ..version import __version__ as YT_DLP_VERSION
	9
	10
	11	class MhtmlFD(FragmentFD):
	12	FD_NAME = 'mhtml'
	13
	14	_STYLESHEET = """\
	15	html, body {
	16	margin: 0;
	17	padding: 0;
	18	height: 100vh;
	19	}
	20
	21	html {
	22	overflow-y: scroll;
	23	scroll-snap-type: y mandatory;
	24	}
	25
	26	body {
	27	scroll-snap-type: y mandatory;
	28	display: flex;
	29	flex-flow: column;
	30	}
	31
	32	body > figure {
	33	max-width: 100vw;
	34	max-height: 100vh;
	35	scroll-snap-align: center;
	36	}
	37
	38	body > figure > figcaption {
	39	text-align: center;
	40	height: 2.5em;
	41	}
	42
	43	body > figure > img {
	44	display: block;
	45	margin: auto;
	46	max-width: 100%;
	47	max-height: calc(100vh - 5em);
	48	}
	49	"""
	50	_STYLESHEET = re.sub(r'\s+', ' ', _STYLESHEET)
	51	_STYLESHEET = re.sub(r'\B \B\|(?<=[\w\-]) (?=[^\w\-])\|(?<=[^\w\-]) (?=[\w\-])', '', _STYLESHEET)
	52
	53	@staticmethod
	54	def _escape_mime(s):
	55	return '=?utf-8?Q?' + (b''.join(
	56	bytes((b,)) if b >= 0x20 else b'=%02X' % b
0f06bcd7	57	for b in quopri.encodestring(s.encode(), header=True)
cdb19aa4	58	)).decode('us-ascii') + '?='
	59
	60	def _gen_cid(self, i, fragment, frag_boundary):
	61	return '%u.%s@yt-dlp.github.io.invalid' % (i, frag_boundary)
	62
	63	def _gen_stub(self, *, fragments, frag_boundary, title):
	64	output = io.StringIO()
	65
	66	output.write((
	67	'<!DOCTYPE html>'
	68	'<html>'
	69	'<head>'
	70	'' '<meta name="generator" content="yt-dlp {version}">'
	71	'' '<title>{title}</title>'
	72	'' '<style>{styles}</style>'
	73	'<body>'
	74	).format(
	75	version=escapeHTML(YT_DLP_VERSION),
	76	styles=self._STYLESHEET,
	77	title=escapeHTML(title)
	78	))
	79
	80	t0 = 0
	81	for i, frag in enumerate(fragments):
	82	output.write('<figure>')
	83	try:
	84	t1 = t0 + frag['duration']
	85	output.write((
	86	'<figcaption>Slide #{num}: {t0} – {t1} (duration: {duration})</figcaption>'
	87	).format(
	88	num=i + 1,
	89	t0=srt_subtitles_timecode(t0),
	90	t1=srt_subtitles_timecode(t1),
	91	duration=formatSeconds(frag['duration'], msec=True)
	92	))
	93	except (KeyError, ValueError, TypeError):
	94	t1 = None
	95	output.write((
	96	'<figcaption>Slide #{num}</figcaption>'
	97	).format(num=i + 1))
	98	output.write('<img src="cid:{cid}">'.format(
	99	cid=self._gen_cid(i, frag, frag_boundary)))
	100	output.write('</figure>')
	101	t0 = t1
	102
	103	return output.getvalue()
	104
	105	def real_download(self, filename, info_dict):
	106	fragment_base_url = info_dict.get('fragment_base_url')
	107	fragments = info_dict['fragments'][:1] if self.params.get(
	108	'test', False) else info_dict['fragments']
d76991ab	109	title = info_dict.get('title', info_dict['format_id'])
d76991ab	110	origin = info_dict.get('webpage_url', info_dict['url'])
cdb19aa4	111
	112	ctx = {
	113	'filename': filename,
	114	'total_frags': len(fragments),
	115	}
	116
3ba7740d	117	self._prepare_and_start_frag_download(ctx, info_dict)
cdb19aa4	118
	119	extra_state = ctx.setdefault('extra_state', {
	120	'header_written': False,
	121	'mime_boundary': str(uuid.uuid4()).replace('-', ''),
	122	})
	123
	124	frag_boundary = extra_state['mime_boundary']
	125
	126	if not extra_state['header_written']:
	127	stub = self._gen_stub(
	128	fragments=fragments,
	129	frag_boundary=frag_boundary,
	130	title=title
	131	)
	132
	133	ctx['dest_stream'].write((
	134	'MIME-Version: 1.0\r\n'
	135	'From: <nowhere@yt-dlp.github.io.invalid>\r\n'
	136	'To: <nowhere@yt-dlp.github.io.invalid>\r\n'
	137	'Subject: {title}\r\n'
	138	'Content-type: multipart/related; '
	139	'' 'boundary="{boundary}"; '
	140	'' 'type="text/html"\r\n'
	141	'X.yt-dlp.Origin: {origin}\r\n'
	142	'\r\n'
	143	'--{boundary}\r\n'
	144	'Content-Type: text/html; charset=utf-8\r\n'
	145	'Content-Length: {length}\r\n'
	146	'\r\n'
	147	'{stub}\r\n'
	148	).format(
	149	origin=origin,
	150	boundary=frag_boundary,
	151	length=len(stub),
	152	title=self._escape_mime(title),
	153	stub=stub
0f06bcd7	154	).encode())
cdb19aa4	155	extra_state['header_written'] = True
	156
	157	for i, fragment in enumerate(fragments):
	158	if (i + 1) <= ctx['fragment_index']:
	159	continue
	160
b3edc806	161	fragment_url = fragment.get('url')
	162	if not fragment_url:
	163	assert fragment_base_url
	164	fragment_url = urljoin(fragment_base_url, fragment['path'])
	165
d71fd412	166	success = self._download_fragment(ctx, fragment_url, info_dict)
cdb19aa4	167	if not success:
cdb19aa4	168	continue
d71fd412	169	frag_content = self._read_fragment(ctx)
cdb19aa4	170
	171	mime_type = b'image/jpeg'
	172	if frag_content.startswith(b'\x89PNG\r\n\x1a\n'):
	173	mime_type = b'image/png'
	174	if frag_content.startswith((b'GIF87a', b'GIF89a')):
	175	mime_type = b'image/gif'
1d485a1a	176	if frag_content.startswith(b'RIFF') and frag_content[8:12] == b'WEBP':
cdb19aa4	177	mime_type = b'image/webp'
	178
	179	frag_header = io.BytesIO()
	180	frag_header.write(
	181	b'--%b\r\n' % frag_boundary.encode('us-ascii'))
	182	frag_header.write(
	183	b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii'))
	184	frag_header.write(
	185	b'Content-type: %b\r\n' % mime_type)
	186	frag_header.write(
	187	b'Content-length: %u\r\n' % len(frag_content))
	188	frag_header.write(
	189	b'Content-location: %b\r\n' % fragment_url.encode('us-ascii'))
	190	frag_header.write(
	191	b'X.yt-dlp.Duration: %f\r\n' % fragment['duration'])
	192	frag_header.write(b'\r\n')
	193	self._append_fragment(
	194	ctx, frag_header.getvalue() + frag_content + b'\r\n')
	195
	196	ctx['dest_stream'].write(
	197	b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii'))
3ba7740d	198	self._finish_frag_download(ctx, info_dict)
cdb19aa4	199	return True