[yt-dlp.git] / devscripts / tomlparse.py

#!/usr/bin/env python3

"""
Simple parser for spec compliant toml files

A simple toml parser for files that comply with the spec.
Should only be used to parse `pyproject.toml` for `install_deps.py`.

IMPORTANT: INVALID FILES OR MULTILINE STRINGS ARE NOT SUPPORTED!
"""

from __future__ import annotations

import datetime
import json
import re

# Optional run of spaces and tabs; never crosses a newline.
WS = r'(?:[\ \t]*)'
# A single-line string: either "basic" (backslash escapes allowed) or 'literal'.
STRING_RE = re.compile(r'"(?:\\.|[^\\"\n])*"|\'[^\'\n]*\'')
# One key segment: a quoted string or a bare key made of letters, digits, `_` and `-`.
SINGLE_KEY_RE = re.compile(rf'{STRING_RE.pattern}|[A-Za-z0-9_-]+')
# A full key: one or more segments joined by dots, with optional whitespace around each.
KEY_RE = re.compile(rf'{WS}(?:{SINGLE_KEY_RE.pattern}){WS}(?:\.{WS}(?:{SINGLE_KEY_RE.pattern}){WS})*')
# The `=` between a key and its value, together with any whitespace following it.
EQUALS_RE = re.compile(rf'={WS}')
# Compiled form of WS, used as the separator whitespace inside inline tables.
WS_RE = re.compile(WS)

# A table header at the start of a line: `[path]`, or `[[path]]` when `is_list` is captured.
_SUBTABLE = rf'(?P<subtable>^\[(?P<is_list>\[)?(?P<path>{KEY_RE.pattern})\]\]?)'
# Start of the next top-level expression: a table header or a `key =` prefix.
EXPRESSION_RE = re.compile(rf'^(?:{_SUBTABLE}|{KEY_RE.pattern}=)', re.MULTILINE)

# Separator whitespace inside arrays: spaces/tabs plus any number of line breaks,
# each line break optionally preceded by a `#` comment.
LIST_WS_RE = re.compile(rf'{WS}((#[^\n]*)?\n{WS})*')
# A bare (unquoted) value: everything up to a `,`, `}`, `]`, tab, newline or `#`.
LEFTOVER_VALUE_RE = re.compile(r'[^,}\]\t\n#]+')


def parse_key(value: str):
    """Yield the individual segments of a (possibly dotted) key.

    Double-quoted segments are decoded with ``json.loads``, single-quoted
    segments have their quotes stripped, and bare segments are yielded as-is.
    """
    for found in SINGLE_KEY_RE.finditer(value):
        segment = found.group()
        opener = segment[0]

        if opener == '"':
            decoded = json.loads(segment)
        elif opener == '\'':
            decoded = segment[1:-1]
        else:
            decoded = segment

        yield decoded


def get_target(root: dict, paths: list[str], is_list=False):
    """Walk ``paths`` down from ``root`` and return the table found there.

    Missing keys are created on the way: as an empty dict, or as an empty
    list for the final key when ``is_list`` is true.  When a key already holds
    a list, the walk continues into its last element, except for the final
    key with ``is_list`` set, where a fresh dict is appended and returned.

    The returned object is always a dict (enforced by an assertion).
    """
    depth = len(paths)
    current = root

    for position, name in enumerate(paths, 1):
        wants_list = is_list and position == depth

        existing = current.get(name)
        if existing is None:
            existing = current[name] = [] if wants_list else {}

        if isinstance(existing, dict):
            current = existing
            continue

        if wants_list:
            # Start a new entry of the array of tables
            current = {}
            existing.append(current)
        else:
            # Descend into the most recently added entry
            current = existing[-1]

    assert isinstance(current, dict)
    return current


def parse_enclosed(data: str, index: int, end: str, ws_re: re.Pattern):
    """Drive the parsing of a bracketed, comma separated sequence.

    ``index`` must point at the opening bracket.  The generator yields
    ``(True, position)`` for every element that starts at ``position`` and
    expects the caller to ``send()`` back the index just past that element.
    Once the ``end`` character is reached it yields
    ``(False, position_after_end)``.

    ``ws_re`` is matched around elements and separators to skip whitespace.
    """
    def skip_ws(position):
        found = ws_re.match(data, position)
        return found.end() if found else position

    # Step over the opening bracket and any whitespace behind it
    index = skip_ws(index + 1)

    while data[index] != end:
        # Hand the element start to the caller; resume just past the element
        index = yield True, index

        index = skip_ws(index)
        if data[index] == ',':
            index += 1
        index = skip_ws(index)

    assert data[index] == end
    yield False, index + 1


def parse_value(data: str, index: int):
    """Parse the value that starts at ``data[index]``.

    Supported values are arrays, inline tables, single-line basic and literal
    strings, and bare literals (integers including ``0x``/``0o``/``0b``
    prefixed ones, floats, times, dates, datetimes and booleans).

    Returns a tuple ``(end_index, value)`` where ``end_index`` is the index
    just past the text consumed for the value.  A bare literal that matches
    none of the known forms is returned as its stripped text.
    """
    if data[index] == '[':
        result = []

        # `parse_enclosed` reports where each element starts and is sent back
        # the index where that element ended
        indices = parse_enclosed(data, index, ']', LIST_WS_RE)
        valid, index = next(indices)
        while valid:
            index, value = parse_value(data, index)
            result.append(value)
            valid, index = indices.send(index)

        return index, result

    if data[index] == '{':
        result = {}

        # Inline tables hold key/value pairs which are stored straight into `result`
        indices = parse_enclosed(data, index, '}', WS_RE)
        valid, index = next(indices)
        while valid:
            valid, index = indices.send(parse_kv_pair(data, index, result))

        return index, result

    if match := STRING_RE.match(data, index):
        # Basic strings are decoded via JSON, literal strings just lose their quotes
        return match.end(), json.loads(match[0]) if match[0][0] == '"' else match[0][1:-1]

    match = LEFTOVER_VALUE_RE.match(data, index)
    assert match
    value = match[0].strip()
    # Try each converter in turn; the first one that accepts the text wins
    for func in [
        int,
        # Base 0 additionally understands the `0x`, `0o` and `0b` prefixes
        lambda text: int(text, 0),
        float,
        datetime.time.fromisoformat,
        datetime.date.fromisoformat,
        datetime.datetime.fromisoformat,
        # Subscript rather than `.get`, so that an unknown literal raises
        # and keeps its text instead of silently turning into `None`
        {'true': True, 'false': False}.__getitem__,
    ]:
        try:
            value = func(value)
            break
        except (ValueError, KeyError):
            pass

    return match.end(), value


def parse_kv_pair(data: str, index: int, target: dict):
    """Parse one ``key = value`` pair at ``index`` and store it in ``target``.

    Dotted keys create (or reuse) the intermediate tables below ``target``.
    Returns the index just past the parsed value, or ``None`` if no key
    could be matched at ``index``.
    """
    key_match = KEY_RE.match(data, index)
    if not key_match:
        return None

    # Everything but the last segment names the table that receives the value
    *parents, leaf = parse_key(key_match[0])

    equals = EQUALS_RE.match(data, key_match.end())
    assert equals

    end, parsed = parse_value(data, equals.end())
    container = get_target(target, parents)
    container[leaf] = parsed
    return end


def parse_toml(data: str):
    """Parse the TOML document ``data`` and return it as nested dicts and lists.

    The text is scanned for table headers and ``key = value`` pairs that
    start a line.  A header switches the table that later pairs are stored
    into; pairs before the first header go into the root table.
    """
    document = {}
    current = document
    position = 0

    while found := EXPRESSION_RE.search(data, position):
        if not found.group('subtable'):
            # A plain `key = value` line; continue right after its value
            position = parse_kv_pair(data, found.start(), current)
            assert position is not None
            continue

        # A `[table]` or `[[array of tables]]` header
        position = found.end()
        segments = list(parse_key(found.group('path')))
        current = get_target(document, segments, bool(found.group('is_list')))

    return document


def main():
    """Read the TOML file named on the command line and print it as JSON.

    Dates, times and datetimes are written as their ISO 8601 strings.
    """
    import argparse
    from pathlib import Path

    parser = argparse.ArgumentParser()
    parser.add_argument('infile', type=Path, help='The TOML file to read as input')
    args = parser.parse_args()

    with args.infile.open('r', encoding='utf-8') as file:
        data = file.read()

    def default(obj):
        # Called by `json.dumps` for objects it cannot serialize on its own
        if isinstance(obj, (datetime.date, datetime.time, datetime.datetime)):
            return obj.isoformat()
        # Falling through would return `None` and make `json.dumps` emit
        # `null`; the hook is expected to raise `TypeError` instead
        raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')

    print(json.dumps(parse_toml(data), default=default))


# Run the command line interface only when executed as a script, not when imported
if __name__ == '__main__':
    main()
Commit	Line	Data
fd647775 SS	1	#!/usr/bin/env python3
	2
	3	"""
	4	Simple parser for spec compliant toml files
	5
	6	A simple toml parser for files that comply with the spec.
	7	Should only be used to parse `pyproject.toml` for `install_deps.py`.
	8
	9	IMPORTANT: INVALID FILES OR MULTILINE STRINGS ARE NOT SUPPORTED!
	10	"""
	11
	12	from __future__ import annotations
	13
	14	import datetime
	15	import json
	16	import re
	17
	18	WS = r'(?:[\ \t]*)'
	19	STRING_RE = re.compile(r'"(?:\\.\|[^\\"\n])"\|\'[^\'\n]\'')
	20	SINGLE_KEY_RE = re.compile(rf'{STRING_RE.pattern}\|[A-Za-z0-9_-]+')
	21	KEY_RE = re.compile(rf'{WS}(?:{SINGLE_KEY_RE.pattern}){WS}(?:\.{WS}(?:{SINGLE_KEY_RE.pattern}){WS})*')
	22	EQUALS_RE = re.compile(rf'={WS}')
	23	WS_RE = re.compile(WS)
	24
	25	_SUBTABLE = rf'(?P<subtable>^\[(?P<is_list>\[)?(?P<path>{KEY_RE.pattern})\]\]?)'
	26	EXPRESSION_RE = re.compile(rf'^(?:{_SUBTABLE}\|{KEY_RE.pattern}=)', re.MULTILINE)
	27
	28	LIST_WS_RE = re.compile(rf'{WS}((#[^\n])?\n{WS})')
	29	LEFTOVER_VALUE_RE = re.compile(r'[^,}\]\t\n#]+')
	30
	31
	32	def parse_key(value: str):
	33	for match in SINGLE_KEY_RE.finditer(value):
	34	if match[0][0] == '"':
	35	yield json.loads(match[0])
	36	elif match[0][0] == '\'':
	37	yield match[0][1:-1]
	38	else:
	39	yield match[0]
	40
	41
	42	def get_target(root: dict, paths: list[str], is_list=False):
	43	target = root
	44
	45	for index, key in enumerate(paths, 1):
	46	use_list = is_list and index == len(paths)
	47	result = target.get(key)
	48	if result is None:
	49	result = [] if use_list else {}
	50	target[key] = result
	51
	52	if isinstance(result, dict):
	53	target = result
	54	elif use_list:
	55	target = {}
	56	result.append(target)
	57	else:
	58	target = result[-1]
	59
	60	assert isinstance(target, dict)
	61	return target
	62
	63
	64	def parse_enclosed(data: str, index: int, end: str, ws_re: re.Pattern):
65	index += 1
66
67	if match := ws_re.match(data, index):
68	index = match.end()
69
70	while data[index] != end:
71	index = yield True, index
72
73	if match := ws_re.match(data, index):
74	index = match.end()
75
76	if data[index] == ',':
77	index += 1
78
79	if match := ws_re.match(data, index):
80	index = match.end()
81
82	assert data[index] == end
83	yield False, index + 1
84
85
86	def parse_value(data: str, index: int):
87	if data[index] == '[':
88	result = []
89
90	indices = parse_enclosed(data, index, ']', LIST_WS_RE)
91	valid, index = next(indices)
92	while valid:
93	index, value = parse_value(data, index)
94	result.append(value)
95	valid, index = indices.send(index)
96
97	return index, result
98
99	if data[index] == '{':
100	result = {}
101
102	indices = parse_enclosed(data, index, '}', WS_RE)
103	valid, index = next(indices)
104	while valid:
105	valid, index = indices.send(parse_kv_pair(data, index, result))
106
107	return index, result
108
109	if match := STRING_RE.match(data, index):
110	return match.end(), json.loads(match[0]) if match[0][0] == '"' else match[0][1:-1]
111
112	match = LEFTOVER_VALUE_RE.match(data, index)
113	assert match
114	value = match[0].strip()
115	for func in [
116	int,
117	float,
118	datetime.time.fromisoformat,
119	datetime.date.fromisoformat,
120	datetime.datetime.fromisoformat,
121	{'true': True, 'false': False}.get,
122	]:
123	try:
124	value = func(value)
125	break
126	except Exception:
127	pass
128
129	return match.end(), value
130
131
132	def parse_kv_pair(data: str, index: int, target: dict):
133	match = KEY_RE.match(data, index)
134	if not match:
135	return None
136
137	*keys, key = parse_key(match[0])
138
139	match = EQUALS_RE.match(data, match.end())
140	assert match
141	index = match.end()
142
143	index, value = parse_value(data, index)
144	get_target(target, keys)[key] = value
145	return index
146
147
148	def parse_toml(data: str):
149	root = {}
150	target = root
151
152	index = 0
153	while True:
154	match = EXPRESSION_RE.search(data, index)
155	if not match:
156	break
157
158	if match.group('subtable'):
159	index = match.end()
160	path, is_list = match.group('path', 'is_list')
161	target = get_target(root, list(parse_key(path)), bool(is_list))
162	continue
163
164	index = parse_kv_pair(data, match.start(), target)
165	assert index is not None
166
167	return root
168
169
170	def main():
171	import argparse
172	from pathlib import Path
173
174	parser = argparse.ArgumentParser()
175	parser.add_argument('infile', type=Path, help='The TOML file to read as input')
176	args = parser.parse_args()
177
178	with args.infile.open('r', encoding='utf-8') as file:
179	data = file.read()
180
181	def default(obj):
182	if isinstance(obj, (datetime.date, datetime.time, datetime.datetime)):
183	return obj.isoformat()
184
185	print(json.dumps(parse_toml(data), default=default))
186
187
188	if __name__ == '__main__':
189	main()