3 from .helpers
import DelimitedList
, any_open_tag
, any_close_tag
4 from datetime
import datetime
7 # some other useful expressions - using lower-case class name since we are really using this as a namespace
8 class pyparsing_common
:
9 """Here are some common low-level expressions that may be useful in
10 jump-starting parser development:
12 - numeric forms (:class:`integers<integer>`, :class:`reals<real>`,
13 :class:`scientific notation<sci_real>`)
14 - common :class:`programming identifiers<identifier>`
15 - network addresses (:class:`MAC<mac_address>`,
16 :class:`IPv4<ipv4_address>`, :class:`IPv6<ipv6_address>`)
17 - ISO8601 :class:`dates<iso8601_date>` and
18 :class:`datetime<iso8601_datetime>`
20 - :class:`comma-separated list<comma_separated_list>`
25 - :class:`convert_to_integer`
26 - :class:`convert_to_float`
27 - :class:`convert_to_date`
28 - :class:`convert_to_datetime`
29 - :class:`strip_html_tags`
30 - :class:`upcase_tokens`
31 - :class:`downcase_tokens`
35 pyparsing_common.number.run_tests('''
36 # any int or real number, returned as the appropriate type
45 pyparsing_common.fnumber.run_tests('''
46 # any int or real number, returned as float
55 pyparsing_common.hex_integer.run_tests('''
61 pyparsing_common.fraction.run_tests('''
67 pyparsing_common.mixed_integer.run_tests('''
76 pyparsing_common.uuid.set_parse_action(token_map(uuid.UUID))
77 pyparsing_common.uuid.run_tests('''
79 12345678-1234-5678-1234-567812345678
84 # any int or real number, returned as the appropriate type
103 # any int or real number, returned as float
150 12345678-1234-5678-1234-567812345678
151 [UUID('12345678-1234-5678-1234-567812345678')]
154 convert_to_integer
= token_map(int)
156 Parse action for converting parsed integers to Python int
159 convert_to_float
= token_map(float)
161 Parse action for converting parsed numbers to Python float
164 integer
= Word(nums
).set_name("integer").set_parse_action(convert_to_integer
)
165 """expression that parses an unsigned integer, returns an int"""
168 Word(hexnums
).set_name("hex integer").set_parse_action(token_map(int, 16))
170 """expression that parses a hexadecimal integer, returns an int"""
174 .set_name("signed integer")
175 .set_parse_action(convert_to_integer
)
177 """expression that parses an integer with optional leading sign, returns an int"""
180 signed_integer().set_parse_action(convert_to_float
)
182 + signed_integer().set_parse_action(convert_to_float
)
183 ).set_name("fraction")
184 """fractional expression of an integer divided by an integer, returns a float"""
185 fraction
.add_parse_action(lambda tt
: tt
[0] / tt
[-1])
188 fraction | signed_integer
+ Opt(Opt("-").suppress() + fraction
)
189 ).set_name("fraction or mixed integer-fraction")
190 """mixed integer of the form 'integer - fraction', with optional leading integer, returns float"""
191 mixed_integer
.add_parse_action(sum)
194 Regex(r
"[+-]?(?:\d+\.\d*|\.\d+)")
195 .set_name("real number")
196 .set_parse_action(convert_to_float
)
198 """expression that parses a floating point number and returns a float"""
201 Regex(r
"[+-]?(?:\d+(?:[eE][+-]?\d+)|(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)?)")
202 .set_name("real number with scientific notation")
203 .set_parse_action(convert_to_float
)
205 """expression that parses a floating point number with optional
206 scientific notation and returns a float"""
# streamlining this expression makes the docs nicer-looking;
# the name is assigned with the PEP8-style set_name(), matching every other
# expression defined in this namespace
number = (sci_real | real | signed_integer).set_name("number").streamline()
"""any numeric expression, returns the corresponding Python type"""
213 Regex(r
"[+-]?\d+\.?\d*([eE][+-]?\d+)?")
215 .set_parse_action(convert_to_float
)
217 """any int or real number, returned as float"""
219 identifier
= Word(identchars
, identbodychars
).set_name("identifier")
220 """typical code identifier (leading alpha or '_', followed by 0 or more alphas, nums, or '_')"""
222 ipv4_address
= Regex(
223 r
"(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(\.(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}"
224 ).set_name("IPv4 address")
225 "IPv4 address (``0.0.0.0 - 255.255.255.255``)"
227 _ipv6_part
= Regex(r
"[0-9a-fA-F]{1,4}").set_name("hex_integer")
228 _full_ipv6_address
= (_ipv6_part
+ (":" + _ipv6_part
) * 7).set_name(
231 _short_ipv6_address
= (
232 Opt(_ipv6_part
+ (":" + _ipv6_part
) * (0, 6))
234 + Opt(_ipv6_part
+ (":" + _ipv6_part
) * (0, 6))
235 ).set_name("short IPv6 address")
236 _short_ipv6_address
.add_condition(
237 lambda t
: sum(1 for tt
in t
if pyparsing_common
._ipv
6_part
.matches(tt
)) < 8
239 _mixed_ipv6_address
= ("::ffff:" + ipv4_address
).set_name("mixed IPv6 address")
240 ipv6_address
= Combine(
241 (_full_ipv6_address | _mixed_ipv6_address | _short_ipv6_address
).set_name(
244 ).set_name("IPv6 address")
245 "IPv6 address (long, short, or mixed form)"
248 r
"[0-9a-fA-F]{2}([:.-])[0-9a-fA-F]{2}(?:\1[0-9a-fA-F]{2}){4}"
249 ).set_name("MAC address")
250 "MAC address xx:xx:xx:xx:xx:xx (may also have '-' or '.' delimiters)"
253 def convert_to_date(fmt
: str = "%Y-%m-%d"):
255 Helper to create a parse action for converting parsed date string to Python datetime.date
258 - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%d"``)
262 date_expr = pyparsing_common.iso8601_date.copy()
263 date_expr.set_parse_action(pyparsing_common.convert_to_date())
264 print(date_expr.parse_string("1999-12-31"))
268 [datetime.date(1999, 12, 31)]
271 def cvt_fn(ss
, ll
, tt
):
273 return datetime
.strptime(tt
[0], fmt
).date()
274 except ValueError as ve
:
275 raise ParseException(ss
, ll
, str(ve
))
280 def convert_to_datetime(fmt
: str = "%Y-%m-%dT%H:%M:%S.%f"):
281 """Helper to create a parse action for converting parsed
282 datetime string to Python datetime.datetime
285 - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%dT%H:%M:%S.%f"``)
289 dt_expr = pyparsing_common.iso8601_datetime.copy()
290 dt_expr.set_parse_action(pyparsing_common.convert_to_datetime())
291 print(dt_expr.parse_string("1999-12-31T23:59:59.999"))
295 [datetime.datetime(1999, 12, 31, 23, 59, 59, 999000)]
300 return datetime
.strptime(t
[0], fmt
)
301 except ValueError as ve
:
302 raise ParseException(s
, l
, str(ve
))
306 iso8601_date
= Regex(
307 r
"(?P<year>\d{4})(?:-(?P<month>\d\d)(?:-(?P<day>\d\d))?)?"
308 ).set_name("ISO8601 date")
309 "ISO8601 date (``yyyy-mm-dd``; the ``-dd`` and ``-mm-dd`` parts are optional)"
311 iso8601_datetime
= Regex(
312 r
"(?P<year>\d{4})-(?P<month>\d\d)-(?P<day>\d\d)[T ](?P<hour>\d\d):(?P<minute>\d\d)(:(?P<second>\d\d(\.\d*)?)?)?(?P<tz>Z|[+-]\d\d:?\d\d)?"
313 ).set_name("ISO8601 datetime")
314 "ISO8601 datetime (``yyyy-mm-ddThh:mm:ss.s(Z|+-00:00)``) - trailing seconds, milliseconds, and timezone optional; accepts separating ``'T'`` or ``' '``"
316 uuid
= Regex(r
"[0-9a-fA-F]{8}(-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}").set_name("UUID")
317 "UUID (``xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx``)"
319 _html_stripper
= any_open_tag
.suppress() | any_close_tag
.suppress()
def strip_html_tags(s: str, l: int, tokens: ParseResults):
    """Parse action that strips HTML tags from the first matched token.

    The first token is passed through ``pyparsing_common._html_stripper``
    (an expression that suppresses any opening or closing tag) using
    ``transform_string``, and the transformed text is returned.

    Example::

        # strip HTML links from normal text
        text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
        td, td_end = make_html_tags("TD")
        table_text = td + SkipTo(td_end).set_parse_action(pyparsing_common.strip_html_tags)("body") + td_end
        print(table_text.parse_string(text).body)

    Prints::

        More info at the pyparsing wiki page
    """
    # ``s`` and ``l`` are part of the parse-action signature but are not used here.
    tag_stripper = pyparsing_common._html_stripper
    html_source = tokens[0]
    return tag_stripper.transform_string(html_source)
344 + Word(printables
, exclude_chars
=",")
345 + Opt(White(" \t") + ~
FollowedBy(LineEnd() |
","))
349 .set_name("commaItem")
351 comma_separated_list
= DelimitedList(
352 Opt(quoted_string
.copy() | _commasepitem
, default
="")
353 ).set_name("comma separated list")
354 """Predefined expression of 1 or more printable words or quoted strings, separated by commas."""
356 upcase_tokens
= staticmethod(token_map(lambda t
: t
.upper()))
357 """Parse action to convert tokens to upper case."""
359 downcase_tokens
= staticmethod(token_map(lambda t
: t
.lower()))
360 """Parse action to convert tokens to lower case."""
364 # https://mathiasbynens.be/demo/url-regex
365 # https://gist.github.com/dperini/729294
367 # protocol identifier (optional)
368 # short syntax // still required
369 r
"(?:(?:(?P<scheme>https?|ftp):)?\/\/)" +
370 # user:pass BasicAuth (optional)
371 r
"(?:(?P<auth>\S+(?::\S*)?)@)?" +
373 # IP address exclusion
374 # private & local networks
375 r
"(?!(?:10|127)(?:\.\d{1,3}){3})" +
376 r
"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})" +
377 r
"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})" +
378 # IP address dotted notation octets
379 # excludes loopback network 0.0.0.0
380 # excludes reserved space >= 224.0.0.0
381 # excludes network & broadcast addresses
382 # (first & last IP address of each class)
383 r
"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" +
384 r
"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" +
385 r
"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" +
387 # host & domain names, may end with dot
388 # can be replaced by a shortest alternative
389 # (?![-_])(?:[-\w\u00a1-\uffff]{0,63}[^-_]\.)+
392 r
"[a-z0-9\u00a1-\uffff]" +
393 r
"[a-z0-9\u00a1-\uffff_-]{0,62}" +
395 r
"[a-z0-9\u00a1-\uffff]\." +
397 # TLD identifier name, may end with dot
398 r
"(?:[a-z\u00a1-\uffff]{2,}\.?)" +
400 # port number (optional)
401 r
"(:(?P<port>\d{2,5}))?" +
402 # resource path (optional)
403 r
"(?P<path>\/[^?# ]*)?" +
404 # query string (optional)
405 r
"(\?(?P<query>[^#]*))?" +
406 # fragment (optional)
407 r
"(#(?P<fragment>\S*))?" +
410 """URL (http/https/ftp scheme)"""
413 # pre-PEP8 compatibility names
414 convertToInteger
= convert_to_integer
415 """Deprecated - use :class:`convert_to_integer`"""
416 convertToFloat
= convert_to_float
417 """Deprecated - use :class:`convert_to_float`"""
418 convertToDate
= convert_to_date
419 """Deprecated - use :class:`convert_to_date`"""
420 convertToDatetime
= convert_to_datetime
421 """Deprecated - use :class:`convert_to_datetime`"""
422 stripHTMLTags
= strip_html_tags
423 """Deprecated - use :class:`strip_html_tags`"""
424 upcaseTokens
= upcase_tokens
425 """Deprecated - use :class:`upcase_tokens`"""
426 downcaseTokens
= downcase_tokens
427 """Deprecated - use :class:`downcase_tokens`"""
431 v
for v
in vars(pyparsing_common
).values() if isinstance(v
, ParserElement
)