From 348a88952160f5684d6774ddec4f8f0dbc0c0b37 Mon Sep 17 00:00:00 2001 From: arne314 <73391160+arne314@users.noreply.github.com> Date: Fri, 27 Dec 2024 22:11:05 +0100 Subject: [PATCH] perf(anki): skip unchanged files using file hashes --- pyproject.toml | 1 + src/anki/anki_api.py | 4 ++- src/anki/file_handler.py | 19 +++++++++----- src/anki/main.py | 11 +++++--- src/anki/parser.py | 52 ++++++++++++++++++++++++++++++-------- src/anki/typst_compiler.py | 4 ++- uv.lock | 11 ++++++++ 7 files changed, 78 insertions(+), 24 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 88e7b53..ebded34 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ readme = "README.md" requires-python = ">=3.11.10" dependencies = [ "aiohttp>=3.11.11", + "appdirs>=1.4.4", "tree-sitter-language-pack>=0.2.0", "typer>=0.15.1", ] diff --git a/src/anki/anki_api.py b/src/anki/anki_api.py index 50ec626..a14e2f5 100644 --- a/src/anki/anki_api.py +++ b/src/anki/anki_api.py @@ -14,6 +14,7 @@ class AnkiConnectError(Exception): class AnkiConnectApi: url: str api_key: str + semaphore: asyncio.Semaphore def __init__(self, url: str, api_key: str): self.url = url @@ -30,7 +31,8 @@ class AnkiConnectApi: else: update.append(card) print(f"Pushing {len(add)} new flashcards and {len(update)} updated flashcards to Anki...") - await asyncio.gather(self._add(add), self._update(update)) + await self._add(add) + await self._update(update) async def _request_api(self, action, **params): async with aiohttp.ClientSession() as session: diff --git a/src/anki/file_handler.py b/src/anki/file_handler.py index bde334b..07726b6 100644 --- a/src/anki/file_handler.py +++ b/src/anki/file_handler.py @@ -1,24 +1,29 @@ -import os.path +import hashlib + +from pathlib import Path from typing import List import tree_sitter class FileHandler: - file_path: str + file_path: Path file_content: List[str] - def __init__(self, path): + def __init__(self, path: Path): self.file_path = path self.read() @property - def directory_path(self) -> str: - return os.path.dirname(self.file_path) + def directory_path(self) -> Path: + return self.file_path.parent def get_bytes(self) -> bytes: return bytes("".join(self.file_content), encoding="utf-8") + def get_file_hash(self) -> str: + return hashlib.md5("".join(self.file_content).encode(), usedforsecurity=False).hexdigest() + def get_node_content(self, node: tree_sitter.Node, remove_outer=False): content = "".join( self.file_content[node.start_point.row:node.end_point.row + 1] @@ -37,9 +42,9 @@ class FileHandler: self.file_content = new_lines def read(self): - with open(self.file_path, encoding="utf-8") as f: + with self.file_path.open(encoding="utf-8") as f: self.file_content = f.readlines() def write(self): - with open(self.file_path, "w", encoding="utf-8") as f: + with self.file_path.open("w", encoding="utf-8") as f: f.writelines(self.file_content) diff --git a/src/anki/main.py b/src/anki/main.py index 1feae5f..10b16ff 100644 --- a/src/anki/main.py +++ b/src/anki/main.py @@ -1,24 +1,24 @@ import asyncio -import glob import os from typing_extensions import Annotated import typer from anki.anki_api import AnkiConnectApi -from anki.file_handler import FileHandler from anki.parser import FlashcardParser from anki.typst_compiler import TypstCompiler cli = typer.Typer(name="typstar-anki") -async def export_flashcards(root_dir, typst_cmd, anki_url, anki_key): +async def export_flashcards(root_dir, clear_cache, typst_cmd, anki_url, anki_key): parser = FlashcardParser() compiler = TypstCompiler(root_dir, typst_cmd) api = AnkiConnectApi(anki_url, anki_key) # parse flashcards + if clear_cache: + parser.clear_file_hashes() flashcards = parser.parse_directory(root_dir) # async typst compilation @@ -30,6 +30,7 @@ async def export_flashcards(root_dir, typst_cmd, anki_url, anki_key): finally: # write id updates to files parser.update_ids_in_source() + parser.save_file_hashes() print("Done") @@ -37,10 +38,12 @@ async def export_flashcards(root_dir, typst_cmd, anki_url, anki_key): def cmd(root_dir: Annotated[ str, typer.Option(help="Directory scanned for flashcards and passed over to typst compile command")] = os.getcwd(), typst_cmd: Annotated[str, typer.Option(help="Typst command used for flashcard compilation")] = "typst", + clear_cache: Annotated[bool, typer.Option(help="Clear stored file hashes and force compilation and " + "push of all flashcards (e.g. on preamble change)")] = False, anki_url: Annotated[str, typer.Option(help="Url for Anki-Connect")] = "http://127.0.0.1:8765", anki_key: Annotated[str, typer.Option(help="Api key for Anki-Connect")] = None, ): - asyncio.run(export_flashcards(root_dir, typst_cmd, anki_url, anki_key)) + asyncio.run(export_flashcards(root_dir, clear_cache, typst_cmd, anki_url, anki_key)) def main(): diff --git a/src/anki/parser.py b/src/anki/parser.py index 1c0adc9..2d1a2b4 100644 --- a/src/anki/parser.py +++ b/src/anki/parser.py @@ -1,9 +1,11 @@ import glob -import os.path +import json from functools import cache +from pathlib import Path from typing import List +import appdirs import tree_sitter from tree_sitter_language_pack import get_language, get_parser @@ -37,14 +39,17 @@ class FlashcardParser: flashcard_query: tree_sitter.Query file_handlers: List[tuple[FileHandler, List[Flashcard]]] + file_hashes: dict[str, str] + file_hashes_store_path: Path = Path(appdirs.user_state_dir("typstar") + "/file_hashes.json") def __init__(self): self.typst_language = get_language("typst") self.typst_parser = get_parser("typst") self.flashcard_query = self.typst_language.query(ts_flashcard_query) self.file_handlers = [] + self._load_file_hashes() - def parse_file(self, file: FileHandler, preamble: str) -> List[Flashcard]: + def _parse_file(self, file: FileHandler, preamble: str) -> List[Flashcard]: cards = [] tree = self.typst_parser.parse(file.get_bytes(), encoding="utf8") captures = self.flashcard_query.captures(tree.root_node) @@ -73,27 +78,51 @@ class FlashcardParser: def parse_directory(self, root_dir): print(f"Parsing flashcards in {root_dir}...") + root_dir = Path(root_dir) preambles = {} flashcards = [] @cache - def get_preamble(path) -> str | None: - while len(path) > len(root_dir): + def get_preamble(path: Path) -> str | None: + while path != root_dir: if preamble := preambles.get(path): return preamble - path = os.path.dirname(path) + path = path.parent for file in sorted(glob.glob(f"{root_dir}/**/**.typ", include_hidden=True, recursive=True)): - if os.path.basename(file) == ".anki.typ": - with open(file, encoding="utf-8") as f: - preambles[os.path.dirname(file)] = f.read() + file = Path(file) + if file.name == ".anki.typ": + preambles[file.parent] = file.read_text(encoding="utf-8") continue fh = FileHandler(file) - cards = self.parse_file(fh, get_preamble(os.path.dirname(file))) - self.file_handlers.append((fh, cards)) - flashcards.extend(cards) + if self._hash_changed(fh): + cards = self._parse_file(fh, get_preamble(file.parent)) + self.file_handlers.append((fh, cards)) + flashcards.extend(cards) return flashcards + def _hash_changed(self, file: FileHandler) -> bool: + file_hash = file.get_file_hash() + cached = self.file_hashes.get(str(file.file_path)) + self.file_hashes[str(file.file_path)] = file_hash + return file_hash != cached + + def _load_file_hashes(self): + self.file_hashes_store_path.parent.mkdir(parents=True, exist_ok=True) + self.file_hashes_store_path.touch() + content = self.file_hashes_store_path.read_text() + if content: + self.file_hashes = json.loads(content) + else: + self.file_hashes = {} + + def save_file_hashes(self): + self.file_hashes_store_path.write_text(json.dumps(self.file_hashes)) + + def clear_file_hashes(self): + self.file_hashes = {} + self.save_file_hashes() + def update_ids_in_source(self): print("Updating ids in source...") for fh, cards in self.file_handlers: @@ -104,3 +133,4 @@ class FlashcardParser: file_updated = True if file_updated: fh.write() + self.file_hashes[str(fh.file_path)] = fh.get_file_hash() diff --git a/src/anki/typst_compiler.py b/src/anki/typst_compiler.py index 2cc5abe..505e640 100644 --- a/src/anki/typst_compiler.py +++ b/src/anki/typst_compiler.py @@ -1,6 +1,8 @@ import asyncio import os import random + +from pathlib import Path from typing import List from .flashcard import Flashcard @@ -31,7 +33,7 @@ class TypstCompiler: self.typst_root_dir = typst_root_dir self.max_processes = round(os.cpu_count() * 1.5) - async def _compile(self, src: str, directory: str) -> bytes: + async def _compile(self, src: str, directory: Path) -> bytes: tmp_path = f"{directory}/tmp_{random.randint(1, 1000000000)}.typ" with open(tmp_path, "w", encoding="utf-8") as f: f.write(src) diff --git a/uv.lock b/uv.lock index 1edfd80..e993f7e 100644 --- a/uv.lock +++ b/uv.lock @@ -84,6 +84,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/6a/bc7e17a3e87a2985d3e8f4da4cd0f481060eb78fb08596c42be62c90a4d9/aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5", size = 7597 }, ] +[[package]] +name = "appdirs" +version = "1.4.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/d8/05696357e0311f5b5c316d7b95f46c669dd9c15aaeecbb48c7d0aeb88c40/appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41", size = 13470 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128", size = 9566 }, +] + [[package]] name = "attrs" version = "24.3.0" @@ -507,6 +516,7 @@ version = "1.0.0" source = { editable = "." } dependencies = [ { name = "aiohttp" }, + { name = "appdirs" }, { name = "tree-sitter-language-pack" }, { name = "typer" }, ] @@ -514,6 +524,7 @@ dependencies = [ [package.metadata] requires-dist = [ { name = "aiohttp", specifier = ">=3.11.11" }, + { name = "appdirs", specifier = ">=1.4.4" }, { name = "tree-sitter-language-pack", specifier = ">=0.2.0" }, { name = "typer", specifier = ">=0.15.1" }, ]