parse: apkindex: optimise parser (MR 2252)

Rewrite the parser to be more efficient:
* Avoid calling str.startswith() once per mapping key on every line; look up
  the line's first character in a dict instead
* Simplify logic by parsing backwards and using lines.pop() rather than
  passing around an index.

Testing with "pmbootstrap test apkindex_parse_all" on my x86 laptop:

Before: Parsed 78537 packages from 7 APKINDEX files in 1.131 seconds
After : Parsed 78537 packages from 7 APKINDEX files in 0.609 seconds

That is a ~1.86x speedup (about 46% less run time), i.e. ~86% more packages
parsed per second, almost twice as fast.

Signed-off-by: Caleb Connolly <caleb@postmarketos.org>
This commit is contained in:
Caleb Connolly 2024-06-12 21:12:49 +02:00 committed by Oliver Smith
parent 62d700c3d8
commit f186ee8498
No known key found for this signature in database
GPG key ID: 5AE7F5513E0885CB

View file

@ -1,7 +1,7 @@
# Copyright 2023 Oliver Smith
# SPDX-License-Identifier: GPL-3.0-or-later
import collections
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Sequence, Union
from pmb.core.arch import Arch
from pmb.core.context import get_context
from pmb.helpers import logging
@ -12,8 +12,20 @@ import pmb.helpers.package
import pmb.helpers.repo
import pmb.parse.version
# Maps the single-letter prefix of an APKINDEX line (the character before the
# ":") to the key name under which parse_next_block() stores that line's value.
# Lines whose prefix is not listed here are not stored in the parsed block.
apkindex_map = {
"A": "arch",
"D": "depends",
"o": "origin",
"P": "pkgname",
"p": "provides",
"k": "provider_priority",
"t": "timestamp",
"V": "version",
}
def parse_next_block(path: Path, lines, start):
# Keys that every parsed block must contain; parse_next_block() counts how many
# of these it found and raises RuntimeError when the count does not match.
required_apkindex_keys = ["arch", "pkgname", "version"]
def parse_next_block(path: Path, lines: List[str]):
"""Parse the next block in an APKINDEX.
:param path: to the APKINDEX.tar.gz
@ -37,42 +49,40 @@ def parse_next_block(path: Path, lines, start):
"""
# Parse until we hit an empty line or end of file
ret: Dict[str, Any] = {}
mapping = {
"A": "arch",
"D": "depends",
"o": "origin",
"P": "pkgname",
"p": "provides",
"k": "provider_priority",
"t": "timestamp",
"V": "version",
}
end_of_block_found = False
for i in range(start[0], len(lines)):
# Check for empty line
start[0] = i + 1
line = lines[i]
if not isinstance(line, str):
line = line.decode()
if line == "\n":
end_of_block_found = True
break
required_found = 0 # Count the required keys we found
line = ""
while len(lines):
# We parse backwards for performance (pop(0) is super slow)
line = lines.pop()
if not line:
continue
# Parse keys from the mapping
for letter, key in mapping.items():
if line.startswith(letter + ":"):
k = line[0]
key = apkindex_map.get(k, None)
# The checksum key is always the FIRST in the block, so when we find
# it we know we're done.
if k == 'C':
break
if key:
if key in ret:
raise RuntimeError(f"Key {key} ({letter}:) specified twice"
f" in block: {ret}, file: {path}")
ret[key] = line[2:-1]
raise RuntimeError(f"Key {key} specified twice in block: {ret}, file: {path}")
if key in required_apkindex_keys:
required_found += 1
ret[key] = line[2:]
# Format and return the block
if end_of_block_found:
if not len(lines) and not ret:
return None
# Check for required keys
for key in ["arch", "pkgname", "version"]:
if required_found != len(required_apkindex_keys):
for key in required_apkindex_keys:
if key not in ret:
raise RuntimeError(f"Missing required key '{key}' in block "
f"{ret}, file: {path}")
raise RuntimeError(f"Expected {len(required_apkindex_keys)} required keys,"
f" but found {required_found} in block: {ret}, file: {path}")
# Format optional lists
for key in ["provides", "depends"]:
@ -90,13 +100,6 @@ def parse_next_block(path: Path, lines, start):
ret[key] = []
return ret
# No more blocks
elif ret != {}:
raise RuntimeError(f"Last block in {path} does not end"
" with a new line! Delete the file and"
f" try again. Last block: {ret}")
return None
def parse_add_block(ret, block, alias=None, multiple_providers=True):
"""Add one block to the return dictionary of parse().
@ -185,19 +188,21 @@ def parse(path: Path, multiple_providers=True):
clear_cache(path)
# Read all lines
lines: Sequence[str]
if tarfile.is_tarfile(path):
with tarfile.open(path, "r:gz") as tar:
with tar.extractfile(tar.getmember("APKINDEX")) as handle: # type:ignore[union-attr]
lines = handle.readlines()
lines = handle.read().decode().splitlines()
else:
with path.open("r", encoding="utf-8") as handle:
lines = handle.readlines()
lines = handle.read().splitlines()
# Parse the whole APKINDEX file
ret: Dict[str, Any] = collections.OrderedDict()
start = [0]
if lines[-1] == "\n":
lines.pop() # Strip the trailing newline
while True:
block = parse_next_block(path, lines, start)
block = parse_next_block(path, lines)
if not block:
break
@ -236,13 +241,12 @@ def parse_blocks(path: Path):
# Parse all lines
with tarfile.open(path, "r:gz") as tar:
with tar.extractfile(tar.getmember("APKINDEX")) as handle: # type:ignore[union-attr]
lines = handle.readlines()
lines = handle.read().decode().splitlines()
# Parse lines into blocks
ret: List[str] = []
start = [0]
while True:
block = pmb.parse.apkindex.parse_next_block(path, lines, start)
block = pmb.parse.apkindex.parse_next_block(path, lines)
if not block:
return ret
ret.append(block)