parse: apkindex: optimise parser (MR 2252)

Rewrite the parser to be more efficient:
* Avoid using str.startswith()
* Simplify logic by parsing backwards and using lines.pop() rather than
  passing around an index.

Testing with "pmbootstrap test apkindex_parse_all" on my x86 laptop:

Before: Parsed 78537 packages from 7 APKINDEX files in 1.131 seconds
After : Parsed 78537 packages from 7 APKINDEX files in 0.609 seconds

That makes for an ~86% improvement, almost twice as fast.

Signed-off-by: Caleb Connolly <caleb@postmarketos.org>
This commit is contained in:
Caleb Connolly 2024-06-12 21:12:49 +02:00 committed by Oliver Smith
parent 62d700c3d8
commit f186ee8498
No known key found for this signature in database
GPG key ID: 5AE7F5513E0885CB

View file

@ -1,7 +1,7 @@
# Copyright 2023 Oliver Smith # Copyright 2023 Oliver Smith
# SPDX-License-Identifier: GPL-3.0-or-later # SPDX-License-Identifier: GPL-3.0-or-later
import collections import collections
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional, Sequence, Union
from pmb.core.arch import Arch from pmb.core.arch import Arch
from pmb.core.context import get_context from pmb.core.context import get_context
from pmb.helpers import logging from pmb.helpers import logging
@ -12,8 +12,20 @@ import pmb.helpers.package
import pmb.helpers.repo import pmb.helpers.repo
import pmb.parse.version import pmb.parse.version
apkindex_map = {
"A": "arch",
"D": "depends",
"o": "origin",
"P": "pkgname",
"p": "provides",
"k": "provider_priority",
"t": "timestamp",
"V": "version",
}
def parse_next_block(path: Path, lines, start): required_apkindex_keys = ["arch", "pkgname", "version"]
def parse_next_block(path: Path, lines: List[str]):
"""Parse the next block in an APKINDEX. """Parse the next block in an APKINDEX.
:param path: to the APKINDEX.tar.gz :param path: to the APKINDEX.tar.gz
@ -37,65 +49,56 @@ def parse_next_block(path: Path, lines, start):
""" """
# Parse until we hit an empty line or end of file # Parse until we hit an empty line or end of file
ret: Dict[str, Any] = {} ret: Dict[str, Any] = {}
mapping = { required_found = 0 # Count the required keys we found
"A": "arch", line = ""
"D": "depends", while len(lines):
"o": "origin", # We parse backwards for performance (pop(0) is super slow)
"P": "pkgname", line = lines.pop()
"p": "provides", if not line:
"k": "provider_priority", continue
"t": "timestamp",
"V": "version",
}
end_of_block_found = False
for i in range(start[0], len(lines)):
# Check for empty line
start[0] = i + 1
line = lines[i]
if not isinstance(line, str):
line = line.decode()
if line == "\n":
end_of_block_found = True
break
# Parse keys from the mapping # Parse keys from the mapping
for letter, key in mapping.items(): k = line[0]
if line.startswith(letter + ":"): key = apkindex_map.get(k, None)
if key in ret:
raise RuntimeError(f"Key {key} ({letter}:) specified twice" # The checksum key is always the FIRST in the block, so when we find
f" in block: {ret}, file: {path}") # it we know we're done.
ret[key] = line[2:-1] if k == 'C':
break
if key:
if key in ret:
raise RuntimeError(f"Key {key} specified twice in block: {ret}, file: {path}")
if key in required_apkindex_keys:
required_found += 1
ret[key] = line[2:]
# Format and return the block # Format and return the block
if end_of_block_found: if not len(lines) and not ret:
# Check for required keys return None
for key in ["arch", "pkgname", "version"]:
# Check for required keys
if required_found != len(required_apkindex_keys):
for key in required_apkindex_keys:
if key not in ret: if key not in ret:
raise RuntimeError(f"Missing required key '{key}' in block " raise RuntimeError(f"Missing required key '{key}' in block "
f"{ret}, file: {path}") f"{ret}, file: {path}")
raise RuntimeError(f"Expected {len(required_apkindex_keys)} required keys,"
f" but found {required_found} in block: {ret}, file: {path}")
# Format optional lists # Format optional lists
for key in ["provides", "depends"]: for key in ["provides", "depends"]:
if key in ret and ret[key] != "": if key in ret and ret[key] != "":
# Ignore all operators for now # Ignore all operators for now
values = ret[key].split(" ") values = ret[key].split(" ")
ret[key] = [] ret[key] = []
for value in values: for value in values:
for operator in [">", "=", "<", "~"]: for operator in [">", "=", "<", "~"]:
if operator in value: if operator in value:
value = value.split(operator)[0] value = value.split(operator)[0]
break break
ret[key].append(value) ret[key].append(value)
else: else:
ret[key] = [] ret[key] = []
return ret return ret
# No more blocks
elif ret != {}:
raise RuntimeError(f"Last block in {path} does not end"
" with a new line! Delete the file and"
f" try again. Last block: {ret}")
return None
def parse_add_block(ret, block, alias=None, multiple_providers=True): def parse_add_block(ret, block, alias=None, multiple_providers=True):
@ -185,19 +188,21 @@ def parse(path: Path, multiple_providers=True):
clear_cache(path) clear_cache(path)
# Read all lines # Read all lines
lines: Sequence[str]
if tarfile.is_tarfile(path): if tarfile.is_tarfile(path):
with tarfile.open(path, "r:gz") as tar: with tarfile.open(path, "r:gz") as tar:
with tar.extractfile(tar.getmember("APKINDEX")) as handle: # type:ignore[union-attr] with tar.extractfile(tar.getmember("APKINDEX")) as handle: # type:ignore[union-attr]
lines = handle.readlines() lines = handle.read().decode().splitlines()
else: else:
with path.open("r", encoding="utf-8") as handle: with path.open("r", encoding="utf-8") as handle:
lines = handle.readlines() lines = handle.read().splitlines()
# Parse the whole APKINDEX file # Parse the whole APKINDEX file
ret: Dict[str, Any] = collections.OrderedDict() ret: Dict[str, Any] = collections.OrderedDict()
start = [0] if lines[-1] == "\n":
lines.pop() # Strip the trailing newline
while True: while True:
block = parse_next_block(path, lines, start) block = parse_next_block(path, lines)
if not block: if not block:
break break
@ -236,13 +241,12 @@ def parse_blocks(path: Path):
# Parse all lines # Parse all lines
with tarfile.open(path, "r:gz") as tar: with tarfile.open(path, "r:gz") as tar:
with tar.extractfile(tar.getmember("APKINDEX")) as handle: # type:ignore[union-attr] with tar.extractfile(tar.getmember("APKINDEX")) as handle: # type:ignore[union-attr]
lines = handle.readlines() lines = handle.read().decode().splitlines()
# Parse lines into blocks # Parse lines into blocks
ret: List[str] = [] ret: List[str] = []
start = [0]
while True: while True:
block = pmb.parse.apkindex.parse_next_block(path, lines, start) block = pmb.parse.apkindex.parse_next_block(path, lines)
if not block: if not block:
return ret return ret
ret.append(block) ret.append(block)