perf(anki): skip unchanged files using file hashes

This commit is contained in:
arne314
2024-12-27 22:11:05 +01:00
parent c3331683e5
commit 348a889521
7 changed files with 78 additions and 24 deletions

View File

@@ -14,6 +14,7 @@ class AnkiConnectError(Exception):
class AnkiConnectApi:
url: str
api_key: str
semaphore: asyncio.Semaphore
def __init__(self, url: str, api_key: str):
self.url = url
@@ -30,7 +31,8 @@ class AnkiConnectApi:
else:
update.append(card)
print(f"Pushing {len(add)} new flashcards and {len(update)} updated flashcards to Anki...")
await asyncio.gather(self._add(add), self._update(update))
await self._add(add)
await self._update(update)
async def _request_api(self, action, **params):
async with aiohttp.ClientSession() as session:

View File

@@ -1,24 +1,29 @@
import os.path
import hashlib
from pathlib import Path
from typing import List
import tree_sitter
class FileHandler:
file_path: str
file_path: Path
file_content: List[str]
def __init__(self, path):
def __init__(self, path: Path):
self.file_path = path
self.read()
@property
def directory_path(self) -> str:
return os.path.dirname(self.file_path)
def directory_path(self) -> Path:
return self.file_path.parent
def get_bytes(self) -> bytes:
    """Return the buffered lines of the file as one UTF-8 encoded byte string."""
    joined = "".join(self.file_content)
    return joined.encode("utf-8")
def get_file_hash(self) -> str:
    """Return the hexadecimal MD5 digest of the buffered file content.

    The digest is requested with ``usedforsecurity=False``: it serves as a
    fingerprint of the content, not as a cryptographic guarantee.
    """
    payload = "".join(self.file_content).encode()
    digest = hashlib.md5(payload, usedforsecurity=False)
    return digest.hexdigest()
def get_node_content(self, node: tree_sitter.Node, remove_outer=False):
content = "".join(
self.file_content[node.start_point.row:node.end_point.row + 1]
@@ -37,9 +42,9 @@ class FileHandler:
self.file_content = new_lines
def read(self):
with open(self.file_path, encoding="utf-8") as f:
with self.file_path.open(encoding="utf-8") as f:
self.file_content = f.readlines()
def write(self):
with open(self.file_path, "w", encoding="utf-8") as f:
with self.file_path.open("w", encoding="utf-8") as f:
f.writelines(self.file_content)

View File

@@ -1,24 +1,24 @@
import asyncio
import glob
import os
from typing_extensions import Annotated
import typer
from anki.anki_api import AnkiConnectApi
from anki.file_handler import FileHandler
from anki.parser import FlashcardParser
from anki.typst_compiler import TypstCompiler
cli = typer.Typer(name="typstar-anki")
async def export_flashcards(root_dir, typst_cmd, anki_url, anki_key):
async def export_flashcards(root_dir, clear_cache, typst_cmd, anki_url, anki_key):
parser = FlashcardParser()
compiler = TypstCompiler(root_dir, typst_cmd)
api = AnkiConnectApi(anki_url, anki_key)
# parse flashcards
if clear_cache:
parser.clear_file_hashes()
flashcards = parser.parse_directory(root_dir)
# async typst compilation
@@ -30,6 +30,7 @@ async def export_flashcards(root_dir, typst_cmd, anki_url, anki_key):
finally:
# write id updates to files
parser.update_ids_in_source()
parser.save_file_hashes()
print("Done")
@@ -37,10 +38,12 @@ async def export_flashcards(root_dir, typst_cmd, anki_url, anki_key):
def cmd(root_dir: Annotated[
str, typer.Option(help="Directory scanned for flashcards and passed over to typst compile command")] = os.getcwd(),
typst_cmd: Annotated[str, typer.Option(help="Typst command used for flashcard compilation")] = "typst",
clear_cache: Annotated[bool, typer.Option(help="Clear stored file hashes and force compilation and "
"push of all flashcards (e.g. on preamble change)")] = False,
anki_url: Annotated[str, typer.Option(help="Url for Anki-Connect")] = "http://127.0.0.1:8765",
anki_key: Annotated[str, typer.Option(help="Api key for Anki-Connect")] = None,
):
asyncio.run(export_flashcards(root_dir, typst_cmd, anki_url, anki_key))
asyncio.run(export_flashcards(root_dir, clear_cache, typst_cmd, anki_url, anki_key))
def main():

View File

@@ -1,9 +1,11 @@
import glob
import os.path
import json
from functools import cache
from pathlib import Path
from typing import List
import appdirs
import tree_sitter
from tree_sitter_language_pack import get_language, get_parser
@@ -37,14 +39,17 @@ class FlashcardParser:
flashcard_query: tree_sitter.Query
file_handlers: List[tuple[FileHandler, List[Flashcard]]]
file_hashes: dict[str, str]
file_hashes_store_path: Path = Path(appdirs.user_state_dir("typstar") + "/file_hashes.json")
def __init__(self):
self.typst_language = get_language("typst")
self.typst_parser = get_parser("typst")
self.flashcard_query = self.typst_language.query(ts_flashcard_query)
self.file_handlers = []
self._load_file_hashes()
def parse_file(self, file: FileHandler, preamble: str) -> List[Flashcard]:
def _parse_file(self, file: FileHandler, preamble: str) -> List[Flashcard]:
cards = []
tree = self.typst_parser.parse(file.get_bytes(), encoding="utf8")
captures = self.flashcard_query.captures(tree.root_node)
@@ -73,27 +78,51 @@ class FlashcardParser:
def parse_directory(self, root_dir):
print(f"Parsing flashcards in {root_dir}...")
root_dir = Path(root_dir)
preambles = {}
flashcards = []
@cache
def get_preamble(path) -> str | None:
while len(path) > len(root_dir):
def get_preamble(path: Path) -> str | None:
while path != root_dir:
if preamble := preambles.get(path):
return preamble
path = os.path.dirname(path)
path = path.parent
for file in sorted(glob.glob(f"{root_dir}/**/**.typ", include_hidden=True, recursive=True)):
if os.path.basename(file) == ".anki.typ":
with open(file, encoding="utf-8") as f:
preambles[os.path.dirname(file)] = f.read()
file = Path(file)
if file.name == ".anki.typ":
preambles[file.parent] = file.read_text(encoding="utf-8")
continue
fh = FileHandler(file)
cards = self.parse_file(fh, get_preamble(os.path.dirname(file)))
self.file_handlers.append((fh, cards))
flashcards.extend(cards)
if self._hash_changed(fh):
cards = self._parse_file(fh, get_preamble(file.parent))
self.file_handlers.append((fh, cards))
flashcards.extend(cards)
return flashcards
def _hash_changed(self, file: FileHandler) -> bool:
    """Report whether ``file`` differs from the hash remembered for its path.

    The current hash is stored in ``self.file_hashes`` (keyed by the string
    form of the file path) as a side effect, replacing any previous entry.
    A path without a previous entry counts as changed.

    NOTE(review): the new hash is recorded as soon as the file is inspected.
    If a later processing step fails and the table is still saved, the file
    would be treated as unchanged on the next run -- confirm this is intended.
    """
    key = str(file.file_path)
    current = file.get_file_hash()
    previous = self.file_hashes.get(key)
    self.file_hashes[key] = current
    return previous != current
def _load_file_hashes(self):
    """Populate ``self.file_hashes`` from the JSON store on disk.

    The parent directory of the store is created if necessary. A missing,
    empty, undecodable or syntactically invalid store -- as well as one whose
    top-level JSON value is not an object -- yields an empty table instead of
    raising, so a damaged store only costs one full re-parse rather than
    making the parser unusable.
    """
    store = self.file_hashes_store_path
    # Create the directory up front so a later write to the store cannot fail
    # on a missing parent.
    store.parent.mkdir(parents=True, exist_ok=True)
    try:
        loaded = json.loads(store.read_text(encoding="utf-8"))
    except (FileNotFoundError, ValueError):
        # ValueError covers json.JSONDecodeError (including an empty file)
        # and UnicodeDecodeError.
        loaded = None
    self.file_hashes = loaded if isinstance(loaded, dict) else {}
def save_file_hashes(self):
    """Write the in-memory hash table to the store file as JSON."""
    serialized = json.dumps(self.file_hashes)
    self.file_hashes_store_path.write_text(serialized)
def clear_file_hashes(self):
    """Replace the hash table with an empty one and save it immediately."""
    self.file_hashes = dict()
    self.save_file_hashes()
def update_ids_in_source(self):
print("Updating ids in source...")
for fh, cards in self.file_handlers:
@@ -104,3 +133,4 @@ class FlashcardParser:
file_updated = True
if file_updated:
fh.write()
self.file_hashes[str(fh.file_path)] = fh.get_file_hash()

View File

@@ -1,6 +1,8 @@
import asyncio
import os
import random
from pathlib import Path
from typing import List
from .flashcard import Flashcard
@@ -31,7 +33,7 @@ class TypstCompiler:
self.typst_root_dir = typst_root_dir
self.max_processes = round(os.cpu_count() * 1.5)
async def _compile(self, src: str, directory: str) -> bytes:
async def _compile(self, src: str, directory: Path) -> bytes:
tmp_path = f"{directory}/tmp_{random.randint(1, 1000000000)}.typ"
with open(tmp_path, "w", encoding="utf-8") as f:
f.write(src)