perf(anki): skip unchanged files using file hashes

This commit is contained in:
arne314
2024-12-27 22:11:05 +01:00
parent c3331683e5
commit 348a889521
7 changed files with 78 additions and 24 deletions

View File

@@ -13,6 +13,7 @@ readme = "README.md"
requires-python = ">=3.11.10" requires-python = ">=3.11.10"
dependencies = [ dependencies = [
"aiohttp>=3.11.11", "aiohttp>=3.11.11",
"appdirs>=1.4.4",
"tree-sitter-language-pack>=0.2.0", "tree-sitter-language-pack>=0.2.0",
"typer>=0.15.1", "typer>=0.15.1",
] ]

View File

@@ -14,6 +14,7 @@ class AnkiConnectError(Exception):
class AnkiConnectApi: class AnkiConnectApi:
url: str url: str
api_key: str api_key: str
semaphore: asyncio.Semaphore
def __init__(self, url: str, api_key: str): def __init__(self, url: str, api_key: str):
self.url = url self.url = url
@@ -30,7 +31,8 @@ class AnkiConnectApi:
else: else:
update.append(card) update.append(card)
print(f"Pushing {len(add)} new flashcards and {len(update)} updated flashcards to Anki...") print(f"Pushing {len(add)} new flashcards and {len(update)} updated flashcards to Anki...")
await asyncio.gather(self._add(add), self._update(update)) await self._add(add)
await self._update(update)
async def _request_api(self, action, **params): async def _request_api(self, action, **params):
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:

View File

@@ -1,24 +1,29 @@
import os.path import hashlib
from pathlib import Path
from typing import List from typing import List
import tree_sitter import tree_sitter
class FileHandler: class FileHandler:
file_path: str file_path: Path
file_content: List[str] file_content: List[str]
def __init__(self, path): def __init__(self, path: Path):
self.file_path = path self.file_path = path
self.read() self.read()
@property @property
def directory_path(self) -> str: def directory_path(self) -> Path:
return os.path.dirname(self.file_path) return self.file_path.parent
def get_bytes(self) -> bytes: def get_bytes(self) -> bytes:
return bytes("".join(self.file_content), encoding="utf-8") return bytes("".join(self.file_content), encoding="utf-8")
def get_file_hash(self) -> str:
return hashlib.md5("".join(self.file_content).encode(), usedforsecurity=False).hexdigest()
def get_node_content(self, node: tree_sitter.Node, remove_outer=False): def get_node_content(self, node: tree_sitter.Node, remove_outer=False):
content = "".join( content = "".join(
self.file_content[node.start_point.row:node.end_point.row + 1] self.file_content[node.start_point.row:node.end_point.row + 1]
@@ -37,9 +42,9 @@ class FileHandler:
self.file_content = new_lines self.file_content = new_lines
def read(self): def read(self):
with open(self.file_path, encoding="utf-8") as f: with self.file_path.open(encoding="utf-8") as f:
self.file_content = f.readlines() self.file_content = f.readlines()
def write(self): def write(self):
with open(self.file_path, "w", encoding="utf-8") as f: with self.file_path.open("w", encoding="utf-8") as f:
f.writelines(self.file_content) f.writelines(self.file_content)

View File

@@ -1,24 +1,24 @@
import asyncio import asyncio
import glob
import os import os
from typing_extensions import Annotated from typing_extensions import Annotated
import typer import typer
from anki.anki_api import AnkiConnectApi from anki.anki_api import AnkiConnectApi
from anki.file_handler import FileHandler
from anki.parser import FlashcardParser from anki.parser import FlashcardParser
from anki.typst_compiler import TypstCompiler from anki.typst_compiler import TypstCompiler
cli = typer.Typer(name="typstar-anki") cli = typer.Typer(name="typstar-anki")
async def export_flashcards(root_dir, typst_cmd, anki_url, anki_key): async def export_flashcards(root_dir, clear_cache, typst_cmd, anki_url, anki_key):
parser = FlashcardParser() parser = FlashcardParser()
compiler = TypstCompiler(root_dir, typst_cmd) compiler = TypstCompiler(root_dir, typst_cmd)
api = AnkiConnectApi(anki_url, anki_key) api = AnkiConnectApi(anki_url, anki_key)
# parse flashcards # parse flashcards
if clear_cache:
parser.clear_file_hashes()
flashcards = parser.parse_directory(root_dir) flashcards = parser.parse_directory(root_dir)
# async typst compilation # async typst compilation
@@ -30,6 +30,7 @@ async def export_flashcards(root_dir, typst_cmd, anki_url, anki_key):
finally: finally:
# write id updates to files # write id updates to files
parser.update_ids_in_source() parser.update_ids_in_source()
parser.save_file_hashes()
print("Done") print("Done")
@@ -37,10 +38,12 @@ async def export_flashcards(root_dir, typst_cmd, anki_url, anki_key):
def cmd(root_dir: Annotated[ def cmd(root_dir: Annotated[
str, typer.Option(help="Directory scanned for flashcards and passed over to typst compile command")] = os.getcwd(), str, typer.Option(help="Directory scanned for flashcards and passed over to typst compile command")] = os.getcwd(),
typst_cmd: Annotated[str, typer.Option(help="Typst command used for flashcard compilation")] = "typst", typst_cmd: Annotated[str, typer.Option(help="Typst command used for flashcard compilation")] = "typst",
clear_cache: Annotated[bool, typer.Option(help="Clear stored file hashes and force compilation and "
"push of all flashcards (e.g. on preamble change)")] = False,
anki_url: Annotated[str, typer.Option(help="Url for Anki-Connect")] = "http://127.0.0.1:8765", anki_url: Annotated[str, typer.Option(help="Url for Anki-Connect")] = "http://127.0.0.1:8765",
anki_key: Annotated[str, typer.Option(help="Api key for Anki-Connect")] = None, anki_key: Annotated[str, typer.Option(help="Api key for Anki-Connect")] = None,
): ):
asyncio.run(export_flashcards(root_dir, typst_cmd, anki_url, anki_key)) asyncio.run(export_flashcards(root_dir, clear_cache, typst_cmd, anki_url, anki_key))
def main(): def main():

View File

@@ -1,9 +1,11 @@
import glob import glob
import os.path import json
from functools import cache from functools import cache
from pathlib import Path
from typing import List from typing import List
import appdirs
import tree_sitter import tree_sitter
from tree_sitter_language_pack import get_language, get_parser from tree_sitter_language_pack import get_language, get_parser
@@ -37,14 +39,17 @@ class FlashcardParser:
flashcard_query: tree_sitter.Query flashcard_query: tree_sitter.Query
file_handlers: List[tuple[FileHandler, List[Flashcard]]] file_handlers: List[tuple[FileHandler, List[Flashcard]]]
file_hashes: dict[str, str]
file_hashes_store_path: Path = Path(appdirs.user_state_dir("typstar") + "/file_hashes.json")
def __init__(self): def __init__(self):
self.typst_language = get_language("typst") self.typst_language = get_language("typst")
self.typst_parser = get_parser("typst") self.typst_parser = get_parser("typst")
self.flashcard_query = self.typst_language.query(ts_flashcard_query) self.flashcard_query = self.typst_language.query(ts_flashcard_query)
self.file_handlers = [] self.file_handlers = []
self._load_file_hashes()
def parse_file(self, file: FileHandler, preamble: str) -> List[Flashcard]: def _parse_file(self, file: FileHandler, preamble: str) -> List[Flashcard]:
cards = [] cards = []
tree = self.typst_parser.parse(file.get_bytes(), encoding="utf8") tree = self.typst_parser.parse(file.get_bytes(), encoding="utf8")
captures = self.flashcard_query.captures(tree.root_node) captures = self.flashcard_query.captures(tree.root_node)
@@ -73,27 +78,51 @@ class FlashcardParser:
def parse_directory(self, root_dir): def parse_directory(self, root_dir):
print(f"Parsing flashcards in {root_dir}...") print(f"Parsing flashcards in {root_dir}...")
root_dir = Path(root_dir)
preambles = {} preambles = {}
flashcards = [] flashcards = []
@cache @cache
def get_preamble(path) -> str | None: def get_preamble(path: Path) -> str | None:
while len(path) > len(root_dir): while path != root_dir:
if preamble := preambles.get(path): if preamble := preambles.get(path):
return preamble return preamble
path = os.path.dirname(path) path = path.parent
for file in sorted(glob.glob(f"{root_dir}/**/**.typ", include_hidden=True, recursive=True)): for file in sorted(glob.glob(f"{root_dir}/**/**.typ", include_hidden=True, recursive=True)):
if os.path.basename(file) == ".anki.typ": file = Path(file)
with open(file, encoding="utf-8") as f: if file.name == ".anki.typ":
preambles[os.path.dirname(file)] = f.read() preambles[file.parent] = file.read_text(encoding="utf-8")
continue continue
fh = FileHandler(file) fh = FileHandler(file)
cards = self.parse_file(fh, get_preamble(os.path.dirname(file))) if self._hash_changed(fh):
self.file_handlers.append((fh, cards)) cards = self._parse_file(fh, get_preamble(file.parent))
flashcards.extend(cards) self.file_handlers.append((fh, cards))
flashcards.extend(cards)
return flashcards return flashcards
def _hash_changed(self, file: FileHandler) -> bool:
file_hash = file.get_file_hash()
cached = self.file_hashes.get(str(file.file_path))
self.file_hashes[str(file.file_path)] = file_hash
return file_hash != cached
def _load_file_hashes(self):
self.file_hashes_store_path.parent.mkdir(parents=True, exist_ok=True)
self.file_hashes_store_path.touch()
content = self.file_hashes_store_path.read_text()
if content:
self.file_hashes = json.loads(content)
else:
self.file_hashes = {}
def save_file_hashes(self):
self.file_hashes_store_path.write_text(json.dumps(self.file_hashes))
def clear_file_hashes(self):
self.file_hashes = {}
self.save_file_hashes()
def update_ids_in_source(self): def update_ids_in_source(self):
print("Updating ids in source...") print("Updating ids in source...")
for fh, cards in self.file_handlers: for fh, cards in self.file_handlers:
@@ -104,3 +133,4 @@ class FlashcardParser:
file_updated = True file_updated = True
if file_updated: if file_updated:
fh.write() fh.write()
self.file_hashes[str(fh.file_path)] = fh.get_file_hash()

View File

@@ -1,6 +1,8 @@
import asyncio import asyncio
import os import os
import random import random
from pathlib import Path
from typing import List from typing import List
from .flashcard import Flashcard from .flashcard import Flashcard
@@ -31,7 +33,7 @@ class TypstCompiler:
self.typst_root_dir = typst_root_dir self.typst_root_dir = typst_root_dir
self.max_processes = round(os.cpu_count() * 1.5) self.max_processes = round(os.cpu_count() * 1.5)
async def _compile(self, src: str, directory: str) -> bytes: async def _compile(self, src: str, directory: Path) -> bytes:
tmp_path = f"{directory}/tmp_{random.randint(1, 1000000000)}.typ" tmp_path = f"{directory}/tmp_{random.randint(1, 1000000000)}.typ"
with open(tmp_path, "w", encoding="utf-8") as f: with open(tmp_path, "w", encoding="utf-8") as f:
f.write(src) f.write(src)

11
uv.lock generated
View File

@@ -84,6 +84,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ec/6a/bc7e17a3e87a2985d3e8f4da4cd0f481060eb78fb08596c42be62c90a4d9/aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5", size = 7597 }, { url = "https://files.pythonhosted.org/packages/ec/6a/bc7e17a3e87a2985d3e8f4da4cd0f481060eb78fb08596c42be62c90a4d9/aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5", size = 7597 },
] ]
[[package]]
name = "appdirs"
version = "1.4.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d7/d8/05696357e0311f5b5c316d7b95f46c669dd9c15aaeecbb48c7d0aeb88c40/appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41", size = 13470 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128", size = 9566 },
]
[[package]] [[package]]
name = "attrs" name = "attrs"
version = "24.3.0" version = "24.3.0"
@@ -507,6 +516,7 @@ version = "1.0.0"
source = { editable = "." } source = { editable = "." }
dependencies = [ dependencies = [
{ name = "aiohttp" }, { name = "aiohttp" },
{ name = "appdirs" },
{ name = "tree-sitter-language-pack" }, { name = "tree-sitter-language-pack" },
{ name = "typer" }, { name = "typer" },
] ]
@@ -514,6 +524,7 @@ dependencies = [
[package.metadata] [package.metadata]
requires-dist = [ requires-dist = [
{ name = "aiohttp", specifier = ">=3.11.11" }, { name = "aiohttp", specifier = ">=3.11.11" },
{ name = "appdirs", specifier = ">=1.4.4" },
{ name = "tree-sitter-language-pack", specifier = ">=0.2.0" }, { name = "tree-sitter-language-pack", specifier = ">=0.2.0" },
{ name = "typer", specifier = ">=0.15.1" }, { name = "typer", specifier = ">=0.15.1" },
] ]