parser.py 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141
  1. from __future__ import annotations
  2. import datetime
  3. import re
  4. import string
  5. from tomlkit._compat import decode
  6. from tomlkit._utils import RFC_3339_LOOSE
  7. from tomlkit._utils import _escaped
  8. from tomlkit._utils import parse_rfc3339
  9. from tomlkit.container import Container
  10. from tomlkit.exceptions import EmptyKeyError
  11. from tomlkit.exceptions import EmptyTableNameError
  12. from tomlkit.exceptions import InternalParserError
  13. from tomlkit.exceptions import InvalidCharInStringError
  14. from tomlkit.exceptions import InvalidControlChar
  15. from tomlkit.exceptions import InvalidDateError
  16. from tomlkit.exceptions import InvalidDateTimeError
  17. from tomlkit.exceptions import InvalidNumberError
  18. from tomlkit.exceptions import InvalidTimeError
  19. from tomlkit.exceptions import InvalidUnicodeValueError
  20. from tomlkit.exceptions import ParseError
  21. from tomlkit.exceptions import UnexpectedCharError
  22. from tomlkit.exceptions import UnexpectedEofError
  23. from tomlkit.items import AoT
  24. from tomlkit.items import Array
  25. from tomlkit.items import Bool
  26. from tomlkit.items import BoolType
  27. from tomlkit.items import Comment
  28. from tomlkit.items import Date
  29. from tomlkit.items import DateTime
  30. from tomlkit.items import Float
  31. from tomlkit.items import InlineTable
  32. from tomlkit.items import Integer
  33. from tomlkit.items import Item
  34. from tomlkit.items import Key
  35. from tomlkit.items import KeyType
  36. from tomlkit.items import Null
  37. from tomlkit.items import SingleKey
  38. from tomlkit.items import String
  39. from tomlkit.items import StringType
  40. from tomlkit.items import Table
  41. from tomlkit.items import Time
  42. from tomlkit.items import Trivia
  43. from tomlkit.items import Whitespace
  44. from tomlkit.source import Source
  45. from tomlkit.toml_char import TOMLChar
  46. from tomlkit.toml_document import TOMLDocument
  # Code points of the control characters that are compared against when
  # validating comments and strings.
  CTRL_I = ord("\t")  # Tab
  CTRL_J = ord("\n")  # Line feed
  CTRL_M = ord("\r")  # Carriage return
  # Code points up to and including this one are treated as control characters.
  CTRL_CHAR_LIMIT = 31
  # Code point of the DEL character.
  CHR_DEL = 127
  class Parser:
      """
      Parser for TOML documents.
      """

      def __init__(self, string: str | bytes) -> None:
          """
          Decodes the given input and wraps it in a Source to parse from.
          """
          text = decode(string)
          self._src = Source(text)
          # Names of the arrays of tables currently being parsed, innermost last.
          self._aot_stack: list[Key] = []
  @property
  def _state(self):
      """
      The ``state`` attribute of the underlying source.
      """
      source = self._src
      return source.state
  @property
  def _idx(self):
      """
      The ``idx`` attribute of the underlying source.
      """
      source = self._src
      return source.idx
  @property
  def _current(self):
      """
      The ``current`` attribute of the underlying source.
      """
      source = self._src
      return source.current
  @property
  def _marker(self):
      """
      The ``marker`` attribute of the underlying source.
      """
      source = self._src
      return source.marker
  def extract(self) -> str:
      """
      Returns the text lying between the marker and the current index.
      """
      extracted = self._src.extract()
      return extracted
  def inc(self, exception: type[ParseError] | None = None) -> bool:
      """
      Moves the parser one character forward unless the input is exhausted.

      The optional exception type is handed over to the underlying source.
      Returns whether or not the parser was able to advance.
      """
      advanced = self._src.inc(exception=exception)
      return advanced
  def inc_n(self, n: int, exception: type[ParseError] | None = None) -> bool:
      """
      Moves the parser n characters forward
      unless the end of the input is reached first.

      The optional exception type is handed over to the underlying source.
      """
      advanced = self._src.inc_n(n=n, exception=exception)
      return advanced
  def consume(self, chars, min=0, max=-1):
      """
      Consumes characters out of ``chars`` within the min/max bounds
      by delegating to the underlying source.
      """
      source = self._src
      return source.consume(chars=chars, min=min, max=max)
  def end(self) -> bool:
      """
      Tells whether the whole input has been consumed.
      """
      finished = self._src.end()
      return finished
  def mark(self) -> None:
      """
      Moves the marker to the position the index is currently at.
      """
      source = self._src
      source.mark()
  def parse_error(self, exception=ParseError, *args, **kwargs):
      """
      Builds (without raising) an error of the given type for the current
      position; extra arguments are forwarded to the underlying source.
      """
      source = self._src
      return source.parse_error(exception, *args, **kwargs)
  def parse(self) -> TOMLDocument:
      """
      Parses the whole input and returns the resulting document.

      Items found before the first "[" are consumed first; everything from
      the first "[" onwards is handled as tables and arrays of tables.
      Any exception raised while appending to the document is re-raised as
      a ParseError created at the current position.
      """
      document = TOMLDocument(True)

      def append(name, element):
          # Surface container-level failures as parse errors.
          try:
              document.append(name, element)
          except Exception as exc:
              raise self.parse_error(ParseError, str(exc)) from exc

      # Everything that comes before the first table/AoT header.
      while not self.end() and self._current != "[":
          parsed = self._parse_item()
          if not parsed:
              break

          key, value = parsed
          dotted = key is not None and key.is_multi()
          # Whitespace merged into the previous item must not be appended again.
          if dotted or not self._merge_ws(value, document):
              append(key, value)

          self.mark()

      # Tables and arrays of tables.
      while not self.end():
          key, value = self._parse_table()
          if isinstance(value, Table) and value.is_aot_element():
              # Only the first element of an AoT was returned;
              # collect the remaining ones along with it.
              value = self._parse_aot(value, key)

          append(key, value)

      document.parsing(False)

      return document
  def _merge_ws(self, item: Item, container: Container) -> bool:
      """
      Folds the given Item into the last entry of the given Container when
      both of them are whitespace items.

      The merged whitespace is re-read from the source: it spans the combined
      length of both items and ends at the current index.

      Returns True if the items were merged.
      """
      previous = container.last_item()
      if not previous:
          return False

      if not (isinstance(item, Whitespace) and isinstance(previous, Whitespace)):
          return False

      stop = self._idx
      start = stop - (len(previous.s) + len(item.s))
      previous_key = container.body[-1][0]
      container.body[-1] = (previous_key, Whitespace(self._src[start:stop]))

      return True
  157. def _is_child(self, parent: Key, child: Key) -> bool:
  158. """
  159. Returns whether a key is strictly a child of another key.
  160. AoT siblings are not considered children of one another.
  161. """
  162. parent_parts = tuple(parent)
  163. child_parts = tuple(child)
  164. if parent_parts == child_parts:
  165. return False
  166. return parent_parts == child_parts[: len(parent_parts)]
  def _parse_item(self) -> tuple[Key | None, Item] | None:
      """
      Tries to parse the next item.

      Returns (None, item) for whitespace and comments, (key, value) for a
      key/value pair, and None when a table header ("[") is reached.
      """
      self.mark()
      with self._state as state:
          while True:
              char = self._current

              if char == "\n":
                  # End of line: everything consumed so far is whitespace.
                  self.inc()
                  return None, Whitespace(self.extract())

              if char in " \t\r":
                  # Keep skipping whitespace; at the end of the input
                  # return what has been collected.
                  if self.inc():
                      continue
                  return None, Whitespace(self.extract())

              if char == "#":
                  # Comment line; what precedes it is its indentation.
                  indent = self.extract()
                  cws, comment, trail = self._parse_comment_trail()
                  return None, Comment(Trivia(indent, cws, comment, trail))

              if char == "[":
                  # Table header: left to the caller.
                  return None

              # Start of a key/value pair. Ask the state to rewind so the
              # whitespace skipped above becomes the pair's indentation.
              state.restore = True
              break

      return self._parse_key_value(True)
  def _parse_comment_trail(self, parse_trail: bool = True) -> tuple[str, str, str]:
      """
      Returns (comment_ws, comment, trail)

      comment_ws and comment are empty strings when no comment is present.
      The trail is only collected when parse_trail is True.
      Raises InvalidControlChar for a control character (other than tab)
      inside a comment and UnexpectedCharError for any other character
      found before the comment or the end of the line.
      """
      if self.end():
          return "", "", ""

      comment_ws = ""
      comment = ""
      self.mark()

      while True:
          char = self._current

          if char == "\n":
              break

          if char == "#":
              # Whatever was skipped up to the "#" is the comment whitespace.
              comment_ws = self.extract()

              self.mark()
              self.inc()  # Skip #

              # Walk to the end of the line, rejecting control characters.
              while not self.end() and not self._current.is_nl():
                  code = ord(self._current)
                  is_control = code == CHR_DEL or (
                      code <= CTRL_CHAR_LIMIT and code != CTRL_I
                  )
                  if is_control:
                      raise self.parse_error(InvalidControlChar, code, "comments")

                  if not self.inc():
                      break

              comment = self.extract()
              self.mark()

              break

          if char not in " \t\r":
              raise self.parse_error(UnexpectedCharError, char)

          self.inc()
          if self.end():
              break

      trail = ""
      if parse_trail:
          while self._current.is_spaces() and self.inc():
              pass

          if self._current == "\r":
              self.inc()

          if self._current == "\n":
              self.inc()

          if self._idx != self._marker or self._current.is_ws():
              trail = self.extract()

      return comment_ws, comment, trail
  def _parse_key_value(self, parse_comment: bool = False) -> tuple[Key, Item]:
      """
      Parses a ``key = value`` pair and returns (key, value).

      When parse_comment is True the comment and trail following the value
      are parsed and stored in the value's trivia; otherwise the trail is
      set to an empty string. Raises UnexpectedCharError when the "=" is
      missing or appears twice.
      """
      # Leading indent
      self.mark()
      while self._current.is_spaces() and self.inc():
          pass
      indent = self.extract()

      # Key
      key = self._parse_key()

      # Separator: exactly one "=" among the separator characters.
      self.mark()
      seen_equals = self._current == "="
      while self._current.is_kv_sep() and self.inc():
          if self._current != "=":
              continue
          if seen_equals:
              raise self.parse_error(UnexpectedCharError, "=")
          seen_equals = True

      if not seen_equals:
          raise self.parse_error(UnexpectedCharError, self._current)

      # Keep any separator text the key already carries.
      if key.sep:
          key.sep += self.extract()
      else:
          key.sep = self.extract()

      # Value
      value = self._parse_value()

      # Comment
      if parse_comment:
          cws, comment, trail = self._parse_comment_trail()
          trivia = value.trivia
          if not trivia.comment_ws:
              trivia.comment_ws = cws
          trivia.comment = comment
          trivia.trail = trail
      else:
          value.trivia.trail = ""

      value.trivia.indent = indent

      return key, value
  def _parse_key(self) -> Key:
      """
      Parses the Key found at the current position, dispatching on whether
      it starts with a quote character.
      Whitespace in front of the key must be exhausted first at the callsite.
      """
      self.mark()
      # Step over any leading whitespace
      while self._current.is_spaces() and self.inc():
          pass

      quoted = self._current in "\"'"
      parse = self._parse_quoted_key if quoted else self._parse_bare_key

      return parse()
  def _parse_quoted_key(self) -> Key:
      """
      Parses a key wrapped in single or double quotes, including any
      dotted continuation that follows it.
      """
      # Whitespace that was skipped in front of the key
      leading = self.extract()

      quote_style = self._current
      key_type = None
      for candidate in KeyType:
          if candidate.value == quote_style:
              key_type = candidate
              break

      if key_type is None:
          raise RuntimeError("Should not have entered _parse_quoted_key()")

      if key_type == KeyType.Basic:
          string_type = StringType.SLB
      else:
          string_type = StringType.SLL

      key_str = self._parse_string(string_type)
      if key_str._t.is_multiline():
          # Multi-line strings cannot be used as keys
          raise self.parse_error(UnexpectedCharError, key_str._t.value)

      quoted = key_str.as_string()

      # Whitespace that follows the closing quote
      self.mark()
      while self._current.is_spaces() and self.inc():
          pass
      original = leading + quoted + self.extract()

      key = SingleKey(str(key_str), t=key_type, sep="", original=original)
      if self._current == ".":
          # Dotted key: parse the remainder and chain it
          self.inc()
          key = key.concat(self._parse_key())

      return key
  def _parse_bare_key(self) -> Key:
      """
      Parses a bare key, including any dotted continuation that follows it.

      Raises EmptyKeyError when nothing but whitespace was found and
      ParseError when the key contains whitespace in the middle.
      """
      # Spaces and tabs are consumed as well so that whitespace around the
      # key ends up in the original representation.
      while (
          self._current.is_bare_key_char() or self._current.is_spaces()
      ) and self.inc():
          pass

      original = self.extract()
      name = original.strip()

      if not name:
          # Empty key
          raise self.parse_error(EmptyKeyError)

      # The loop above also consumes tabs, so both kinds of interior
      # whitespace have to be rejected, not only plain spaces.
      if " " in name or "\t" in name:
          # Bare key with whitespace in it
          raise self.parse_error(ParseError, f'Invalid key "{name}"')

      key = SingleKey(name, KeyType.Bare, "", original)

      if self._current == ".":
          # Dotted key: parse the remainder and chain it
          self.inc()
          key = key.concat(self._parse_key())

      return key
  def _parse_value(self) -> Item:
      """
      Tries to parse the value found at the current position.

      The first character selects the kind of value: strings, booleans,
      arrays and inline tables are delegated to their own parsers, while
      numbers, dates, times and datetimes are handled here.
      Raises UnexpectedCharError when no value can start with the current
      character, and the matching Invalid*Error for malformed numbers,
      dates, times and datetimes.
      """
      self.mark()
      first = self._current
      trivia = Trivia()

      if first == StringType.SLB.value:
          return self._parse_basic_string()
      if first == StringType.SLL.value:
          return self._parse_literal_string()
      if first == BoolType.TRUE.value[0]:
          return self._parse_true()
      if first == BoolType.FALSE.value[0]:
          return self._parse_false()
      if first == "[":
          return self._parse_array()
      if first == "{":
          return self._parse_inline_table()

      special_floats = {"+inf", "-inf", "inf", "+nan", "-nan", "nan"}
      if first in "+-" or self._peek(4) in special_floats:
          # Signed number or special float
          while self._current not in " \t\n\r#,]}" and self.inc():
              pass

          raw = self.extract()

          number = self._parse_number(raw, trivia)
          if number is not None:
              return number

          raise self.parse_error(InvalidNumberError)

      if first not in string.digits:
          raise self.parse_error(UnexpectedCharError, first)

      # Integer, Float, Date, Time or DateTime
      while self._current not in " \t\n\r#,]}" and self.inc():
          pass

      raw = self.extract()

      match = RFC_3339_LOOSE.match(raw)
      if match:
          date_text, time_text = match.group(1), match.group(5)

          if date_text and time_text:
              # datetime
              try:
                  moment = parse_rfc3339(raw)
                  assert isinstance(moment, datetime.datetime)
                  return DateTime(
                      moment.year,
                      moment.month,
                      moment.day,
                      moment.hour,
                      moment.minute,
                      moment.second,
                      moment.microsecond,
                      moment.tzinfo,
                      trivia,
                      raw,
                  )
              except ValueError:
                  raise self.parse_error(InvalidDateTimeError)

          if date_text:
              try:
                  day = parse_rfc3339(raw)
                  assert isinstance(day, datetime.date)
                  date = Date(day.year, day.month, day.day, trivia, raw)

                  # A time may follow the date after a space, so keep
                  # reading; this time a space does not stop the scan.
                  self.mark()
                  while self._current not in "\t\n\r#,]}" and self.inc():
                      pass

                  rest = self.extract()
                  clock = rest.rstrip()
                  trivia.comment_ws = rest[len(clock) :]
                  if not clock:
                      return date

                  moment = parse_rfc3339(raw + clock)
                  assert isinstance(moment, datetime.datetime)
                  return DateTime(
                      moment.year,
                      moment.month,
                      moment.day,
                      moment.hour,
                      moment.minute,
                      moment.second,
                      moment.microsecond,
                      moment.tzinfo,
                      trivia,
                      raw + clock,
                  )
              except ValueError:
                  raise self.parse_error(InvalidDateError)

          if time_text:
              try:
                  clock_time = parse_rfc3339(raw)
                  assert isinstance(clock_time, datetime.time)
                  return Time(
                      clock_time.hour,
                      clock_time.minute,
                      clock_time.second,
                      clock_time.microsecond,
                      clock_time.tzinfo,
                      trivia,
                      raw,
                  )
              except ValueError:
                  raise self.parse_error(InvalidTimeError)

      number = self._parse_number(raw, trivia)
      if number is not None:
          return number

      raise self.parse_error(InvalidNumberError)
  def _parse_true(self):
      """
      Parses the boolean literal described by BoolType.TRUE.
      """
      style = BoolType.TRUE
      return self._parse_bool(style)
  def _parse_false(self):
      """
      Parses the boolean literal described by BoolType.FALSE.
      """
      style = BoolType.FALSE
      return self._parse_bool(style)
  def _parse_bool(self, style: BoolType) -> Bool:
      """
      Parses the boolean literal given by ``style`` at the current position
      and returns the matching Bool item.
      """
      with self._state:
          style = BoolType(style)

          # Every character of the literal must be present exactly once,
          # in the order the style yields them.
          for expected in style:
              self.consume(expected, min=1, max=1)

          return Bool(style, Trivia())
  def _parse_array(self) -> Array:
      """
      Parses an array starting at its opening bracket.

      Whitespace, comments and commas are kept as elements next to the
      values so that the original formatting can be reproduced.
      Raises UnexpectedEofError when the input ends inside the array,
      UnexpectedCharError on a character that fits nowhere, and ParseError
      when the collected elements are rejected by Array.
      """
      # Consume opening bracket, EOF here is an issue (middle of array)
      self.inc(exception=UnexpectedEofError)

      elems: list[Item] = []
      # None: no value seen yet, True: a value was just parsed,
      # False: a comma was just consumed.
      prev_value = None
      while True:
          # consume whitespace
          mark = self._idx
          self.consume(TOMLChar.SPACES + TOMLChar.NL)
          indent = self._src[mark : self._idx]
          newline = set(TOMLChar.NL) & set(indent)
          if newline:
              elems.append(Whitespace(indent))
              continue

          # consume comment
          if self._current == "#":
              cws, comment, trail = self._parse_comment_trail(parse_trail=False)
              elems.append(Comment(Trivia(indent, cws, comment, trail)))
              continue

          # consume indent
          if indent:
              elems.append(Whitespace(indent))
              continue

          # consume value
          if not prev_value:
              try:
                  elems.append(self._parse_value())
                  prev_value = True
                  continue
              except UnexpectedCharError:
                  # Not a value; fall through to the comma/bracket checks.
                  pass

          # consume comma
          if prev_value and self._current == ",":
              self.inc(exception=UnexpectedEofError)
              elems.append(Whitespace(","))
              prev_value = False
              continue

          # consume closing bracket
          if self._current == "]":
              # consume closing bracket, EOF here doesn't matter
              self.inc()
              break

          raise self.parse_error(UnexpectedCharError, self._current)

      try:
          return Array(elems, Trivia())
      except ValueError as e:
          # Report the failure instead of silently returning None, which
          # callers would only trip over later.
          raise self.parse_error(ParseError, str(e)) from e
  def _parse_inline_table(self) -> InlineTable:
      """
      Parses an inline table starting at its opening brace.

      Raises UnexpectedEofError when the input ends inside the table and
      UnexpectedCharError for leading, doubled or trailing commas as well
      as for key/value pairs that are not separated by a comma.
      """
      # consume opening bracket, EOF here is an issue (middle of array)
      self.inc(exception=UnexpectedEofError)

      elems = Container(True)

      def take_spaces():
          # Store a run of spaces, if there is one, as a whitespace element.
          start = self._idx
          self.consume(TOMLChar.SPACES)
          spaces = self._src[start : self._idx]
          if spaces:
              elems.add(Whitespace(spaces))

      # None: nothing parsed yet (table may be empty)
      # False: the previous key-value pair was not followed by a comma
      # True: the previous key-value pair was followed by a comma
      trailing_comma = None
      while True:
          take_spaces()

          if trailing_comma:
              # A pair has to follow a comma
              if self._current in ("}", ","):
                  raise self.parse_error(UnexpectedCharError, self._current)
          else:
              if self._current == "}":
                  # consume closing bracket, EOF here doesn't matter
                  self.inc()
                  break

              missing_comma = trailing_comma is False
              leading_comma = trailing_comma is None and self._current == ","
              if missing_comma or leading_comma:
                  raise self.parse_error(UnexpectedCharError, self._current)

          key, val = self._parse_key_value(False)
          elems.add(key, val)

          take_spaces()

          # consume trailing comma
          trailing_comma = self._current == ","
          if trailing_comma:
              # EOF here is an issue (middle of inline table)
              self.inc(exception=UnexpectedEofError)

      return InlineTable(elems, Trivia())
  557. def _parse_number(self, raw: str, trivia: Trivia) -> Item | None:
  558. # Leading zeros are not allowed
  559. sign = ""
  560. if raw.startswith(("+", "-")):
  561. sign = raw[0]
  562. raw = raw[1:]
  563. if len(raw) > 1 and (
  564. raw.startswith("0")
  565. and not raw.startswith(("0.", "0o", "0x", "0b", "0e"))
  566. or sign
  567. and raw.startswith(".")
  568. ):
  569. return None
  570. if raw.startswith(("0o", "0x", "0b")) and sign:
  571. return None
  572. digits = "[0-9]"
  573. base = 10
  574. if raw.startswith("0b"):
  575. digits = "[01]"
  576. base = 2
  577. elif raw.startswith("0o"):
  578. digits = "[0-7]"
  579. base = 8
  580. elif raw.startswith("0x"):
  581. digits = "[0-9a-f]"
  582. base = 16
  583. # Underscores should be surrounded by digits
  584. clean = re.sub(f"(?i)(?<={digits})_(?={digits})", "", raw).lower()
  585. if "_" in clean:
  586. return None
  587. if (
  588. clean.endswith(".")
  589. or not clean.startswith("0x")
  590. and clean.split("e", 1)[0].endswith(".")
  591. ):
  592. return None
  593. try:
  594. return Integer(int(sign + clean, base), trivia, sign + raw)
  595. except ValueError:
  596. try:
  597. return Float(float(sign + clean), trivia, sign + raw)
  598. except ValueError:
  599. return None
  def _parse_literal_string(self) -> String:
      """
      Parses a string that opens with the literal (StringType.SLL) delimiter.
      """
      delim = StringType.SLL
      with self._state:
          return self._parse_string(delim)
  def _parse_basic_string(self) -> String:
      """
      Parses a string that opens with the basic (StringType.SLB) delimiter.
      """
      delim = StringType.SLB
      with self._state:
          return self._parse_string(delim)
  def _parse_escaped_char(self, multiline):
      """
      Parses what follows a backslash inside a basic string and returns the
      text it stands for.

      Raises InvalidCharInStringError for an unknown escape,
      InvalidUnicodeValueError for a bad unicode escape and
      UnexpectedEofError when the input ends in the middle of the string.
      """
      if multiline and self._current.is_ws():
          # A backslash at the end of a line trims all the whitespace
          # (newlines included) up to the next non-whitespace character
          # or closing delimiter.
          # """\
          #     hello \
          #     world"""
          saw_newline = False
          while self._current.is_ws():
              if self._current == "\n":
                  saw_newline = True
              # EOF here is an issue (middle of string)
              self.inc(exception=UnexpectedEofError)

          # Whitespace after the backslash is only valid when it contains
          # a newline before any other character.
          if not saw_newline:
              raise self.parse_error(InvalidCharInStringError, self._current)

          return ""

      if self._current in _escaped:
          replacement = _escaped[self._current]

          # EOF here is an issue (middle of string)
          self.inc(exception=UnexpectedEofError)

          return replacement

      if self._current in {"u", "U"}:
          # A unicode escape is expected here
          char, digits = self._peek_unicode(self._current == "U")
          if char is None:
              raise self.parse_error(InvalidUnicodeValueError)

          # Skip the u/U prefix along with the digits
          self.inc_n(len(digits) + 1)

          return char

      raise self.parse_error(InvalidCharInStringError, self._current)
  def _parse_string(self, delim: StringType) -> String:
      """
      Parses a string whose opening delimiter is at the current position.

      ``delim`` is the single-line string type to start from; it is toggled
      to its multi-line counterpart when three delimiters open the string.
      Raises InternalParserError when the current character is not the
      delimiter, InvalidControlChar for a forbidden control character,
      InvalidCharInStringError for too many closing delimiters, and
      UnexpectedEofError when the input ends inside the string.
      """
      # The current character has to be the delimiter itself
      if self._current != delim.unit:
          raise self.parse_error(
              InternalParserError,
              f"Invalid character for string type {delim}",
          )

      # Step over the first delimiter, EOF here is an issue
      # (middle of string or middle of delim)
      self.inc(exception=UnexpectedEofError)

      if self._current == delim.unit:
          # Second delimiter in a row. EOF is fine here: it would just
          # mean an empty single line string.
          reached_end = not self.inc()
          if reached_end or self._current != delim.unit:
              # Empty string
              return String(delim, "", "", Trivia())

          # Third delimiter, EOF here is an issue (middle of string)
          self.inc(exception=UnexpectedEofError)

          delim = delim.toggle()  # switch to the multi-line variant

      self.mark()  # the original text is extracted from here on
      value = ""

      # A newline right after the opening delimiter is trimmed
      if delim.is_multiline():
          if self._current == "\n":
              # EOF here is an issue (middle of string)
              self.inc(exception=UnexpectedEofError)
          else:
              # Look one character ahead without moving to detect "\r\n"
              pair = self._current
              with self._state(restore=True):
                  if self.inc():
                      pair += self._current
              if pair == "\r\n":
                  self.inc_n(2, exception=UnexpectedEofError)

      escaped = False  # whether the previous character was a backslash
      while True:
          code = ord(self._current)

          forbidden_single = code == CHR_DEL or (
              code <= CTRL_CHAR_LIMIT and code != CTRL_I
          )
          forbidden_multi = code == CHR_DEL or (
              code <= CTRL_CHAR_LIMIT and code not in [CTRL_I, CTRL_J, CTRL_M]
          )
          if not escaped and (
              (delim.is_singleline() and forbidden_single)
              or (delim.is_multiline() and forbidden_multi)
          ):
              raise self.parse_error(InvalidControlChar, code, "strings")

          if not escaped and self._current == delim.unit:
              # Possibly the closing delimiter
              original = self.extract()

              if not delim.is_multiline():
                  # EOF does not matter here: it would simply be the
                  # end of the input
                  self.inc()

                  return String(delim, value, original, Trivia())

              # Collect the run of delimiters to find out whether the
              # string ends here
              closing = ""
              while self._current == delim.unit:
                  closing += self._current
                  self.inc()

              count = len(closing)
              if count < 3:
                  # Not a closing delimiter; the quotes belong to the value
                  value += closing
                  continue

              if count == 3:
                  # End of the string
                  return String(delim, value, original, Trivia())

              if count >= 6:
                  raise self.parse_error(InvalidCharInStringError, self._current)

              # Up to two extra quotes belong to the value; the last three
              # close the string
              value += closing[:-3]
              original += closing[:-3]

              return String(delim, value, original, Trivia())

          if delim.is_basic() and escaped:
              # Raises if the escape sequence is not valid
              value += self._parse_escaped_char(delim.is_multiline())

              escaped = False
              continue

          if delim.is_basic() and self._current == "\\":
              # The next character is escaped
              escaped = True

              # EOF here is an issue (middle of string)
              self.inc(exception=UnexpectedEofError)
              continue

          # Ordinary character: literal strings keep everything as is
          value += self._current

          # EOF here is an issue (middle of string)
          self.inc(exception=UnexpectedEofError)
  def _parse_table(
      self, parent_name: Key | None = None, parent: Table | None = None
  ) -> tuple[Key, Table | AoT]:
      """
      Parses a table element.

      ``parent_name`` is the full key of the table this one is nested in,
      if any; its parts are stripped from the front of the parsed name.
      ``parent`` is accepted for the recursive calls but not read here.

      Returns the key to store the element under along with the element,
      which is a Table or, for an array of tables, an AoT.

      Raises InternalParserError when not positioned on "[",
      UnexpectedEofError when the input ends inside the header,
      EmptyTableNameError for an empty name, UnexpectedCharError when a
      closing bracket is missing, and ParseError for a bare name part
      that contains a space.
      """
      if self._current != "[":
          raise self.parse_error(
              InternalParserError, "_parse_table() called on non-bracket character."
          )

      indent = self.extract()
      self.inc()  # Skip opening bracket

      if self.end():
          raise self.parse_error(UnexpectedEofError)

      is_aot = False
      if self._current == "[":
          if not self.inc():
              raise self.parse_error(UnexpectedEofError)

          is_aot = True
      try:
          key = self._parse_key()
      except EmptyKeyError:
          raise self.parse_error(EmptyTableNameError) from None
      if self.end():
          raise self.parse_error(UnexpectedEofError)
      elif self._current != "]":
          raise self.parse_error(UnexpectedCharError, self._current)

      key.sep = ""
      full_key = key
      name_parts = tuple(key)
      if any(" " in part.key.strip() and part.is_bare() for part in name_parts):
          raise self.parse_error(
              ParseError, f'Invalid table name "{full_key.as_string()}"'
          )

      missing_table = False
      if parent_name:
          parent_name_parts = tuple(parent_name)
      else:
          parent_name_parts = ()

      # More than one part below the parent means intermediate tables
      # have not been declared.
      if len(name_parts) > len(parent_name_parts) + 1:
          missing_table = True

      name_parts = name_parts[len(parent_name_parts) :]

      values = Container(True)

      self.inc()  # Skip closing bracket
      if is_aot:
          # An array of tables header needs a second closing bracket
          # right after the first one.
          if self.end():
              raise self.parse_error(UnexpectedEofError)
          if self._current != "]":
              raise self.parse_error(UnexpectedCharError, self._current)
          self.inc()

      cws, comment, trail = self._parse_comment_trail()

      result = Null()
      table = Table(
          values,
          Trivia(indent, cws, comment, trail),
          is_aot,
          name=name_parts[0].key if name_parts else key.key,
          display_name=full_key.as_string(),
          is_super_table=False,
      )

      if len(name_parts) > 1:
          if missing_table:
              # Missing super table
              # i.e. a table initialized like this: [foo.bar]
              # without initializing [foo]
              #
              # So we have to create the parent tables
              table = Table(
                  Container(True),
                  Trivia("", cws, comment, trail),
                  is_aot and name_parts[0] in self._aot_stack,
                  is_super_table=True,
                  name=name_parts[0].key,
              )

          result = table
          key = name_parts[0]

          # Walk down the remaining parts; the last one is the table the
          # parsed items end up in.
          for i, _name in enumerate(name_parts[1:]):
              child = table.get(
                  _name,
                  Table(
                      Container(True),
                      Trivia(indent, cws, comment, trail),
                      is_aot and i == len(name_parts) - 2,
                      is_super_table=i < len(name_parts) - 2,
                      name=_name.key,
                      display_name=full_key.as_string()
                      if i == len(name_parts) - 2
                      else None,
                  ),
              )

              if is_aot and i == len(name_parts) - 2:
                  table.raw_append(_name, AoT([child], name=table.name, parsed=True))
              else:
                  table.raw_append(_name, child)

              table = child
              values = table.value
      else:
          if name_parts:
              key = name_parts[0]

      while not self.end():
          item = self._parse_item()
          if item:
              _key, item = item
              if not self._merge_ws(item, values):
                  table.raw_append(_key, item)
          else:
              if self._current == "[":
                  _, key_next = self._peek_table()

                  if self._is_child(full_key, key_next):
                      key_next, table_next = self._parse_table(full_key, table)

                      table.raw_append(key_next, table_next)

                      # Picking up any sibling
                      while not self.end():
                          _, key_next = self._peek_table()

                          if not self._is_child(full_key, key_next):
                              break

                          key_next, table_next = self._parse_table(full_key, table)

                          table.raw_append(key_next, table_next)

                  break
              else:
                  raise self.parse_error(
                      InternalParserError,
                      "_parse_item() returned None on a non-bracket character.",
                  )

      if isinstance(result, Null):
          result = table

          # Start collecting the array unless this table is an element of
          # the AoT that is already being parsed.
          if is_aot and (not self._aot_stack or full_key != self._aot_stack[-1]):
              result = self._parse_aot(result, full_key)

      return key, result
  def _peek_table(self) -> tuple[bool, Key]:
      """
      Looks at the upcoming table header without consuming it: the parser
      state is restored when this method is left.

      Returns whether the header belongs to an AoT, along with the name of
      the table about to be parsed.
      Raises InternalParserError when not positioned on "[" and
      EmptyTableNameError when the name is empty.
      """
      # the state is restored no matter how this scope is left
      with self._state(save_marker=True, restore=True):
          if self._current != "[":
              raise self.parse_error(
                  InternalParserError,
                  "_peek_table() entered on non-bracket character",
              )

          self.inc()

          # A second opening bracket marks an AoT
          is_aot = self._current == "["
          if is_aot:
              self.inc()

          try:
              name = self._parse_key()
          except EmptyKeyError:
              raise self.parse_error(EmptyTableNameError) from None

          return is_aot, name
  def _parse_aot(self, first: Table, name_first: Key) -> AoT:
      """
      Collects the tables that follow ``first`` and belong to the same
      array of tables, then bundles all of them into an AoT.

      While collecting, ``name_first`` sits on top of the AoT stack.
      """
      tables = [first]
      self._aot_stack.append(name_first)

      while not self.end():
          is_aot_next, name_next = self._peek_table()
          # Stop at the first header that is not another element of this array
          if not (is_aot_next and name_next == name_first):
              break

          _, sibling = self._parse_table(name_first)
          tables.append(sibling)

      self._aot_stack.pop()

      return AoT(tables, parsed=True)
  def _peek(self, n: int) -> str:
      """
      Returns up to n of the upcoming characters without consuming them.

      Peeking stops early at whitespace, "#", ",", "]", "}" or the end of
      the input.
      """
      # the state is restored no matter how this scope is left
      with self._state(restore=True):
          stop_chars = " \t\n\r#,]}" + self._src.EOF
          peeked = []
          for _ in range(n):
              if self._current in stop_chars:
                  break

              peeked.append(self._current)
              self.inc()

          return "".join(peeked)
  def _peek_unicode(self, is_long: bool) -> tuple[str | None, str | None]:
      """
      Looks at the unicode escape at the current position without consuming
      it: the parser state is restored when this method is left.

      ``is_long`` selects the 8-digit form instead of the 4-digit one.

      Returns (character, digits). The character is None when the escape is
      not valid; the digits are None when the input is too short or the
      value is a surrogate code point.
      Raises InternalParserError when not positioned on "u" or "U".
      """
      # the state is restored no matter how this scope is left
      with self._state(save_marker=True, restore=True):
          if self._current not in {"u", "U"}:
              raise self.parse_error(
                  InternalParserError, "_peek_unicode() entered on non-unicode value"
              )

          self.inc()  # Dropping prefix
          self.mark()

          chars = 8 if is_long else 4

          if not self.inc_n(chars):
              # Not enough characters left for a complete escape
              return None, None

          extracted = self.extract()

          # int() tolerates signs, underscores and surrounding whitespace,
          # so make sure there is nothing but hexadecimal digits.
          if any(char not in string.hexdigits for char in extracted):
              return None, extracted

          code = int(extracted, 16)

          # Surrogates are not valid characters. Checking the number
          # covers the 8-digit form too, where the value is zero-padded.
          if 0xD800 <= code <= 0xDFFF:
              return None, None

          try:
              value = chr(code)
          except (ValueError, OverflowError):
              value = None

          return value, extracted