eXpressionist
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 40 additions & 0 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 6 additions & 1 deletion b/‎.gitignore‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎AGENTS.md‎
Lines changed: 44 additions & 0 deletions b/‎AGENTS.md‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 3 deletions b/‎pyproject.toml‎
Lines changed: 1 addition & 3 deletions
diff --git a/‎subs_diff/__init__.py‎
Lines changed: 4 additions & 4 deletions b/‎subs_diff/__init__.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎subs_diff/__main__.py‎
Lines changed: 1 addition & 0 deletions b/‎subs_diff/__main__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎subs_diff/align.py‎
Lines changed: 10 additions & 7 deletions b/‎subs_diff/align.py‎
Lines changed: 10 additions & 7 deletions
diff --git a/‎subs_diff/checkpoint.py‎
Lines changed: 50 additions & 0 deletions b/‎subs_diff/checkpoint.py‎
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,40 @@
+name: CI
+
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+
+jobs:
+  test:
+    name: Python ${{ matrix.python-version }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: pip
+
+      - name: Install
+        run: python -m pip install -e ".[dev]"
+
+      - name: Test
+        run: pytest -q
+
+      - name: Ruff
+        run: ruff check .
+
+      - name: Black
+        run: black --check .
+
+      - name: Mypy
+        run: mypy subs_diff
@@ -45,6 +45,11 @@ coverage.xml
 *.py,cover
 .hypothesis/
 .pytest_cache/
+.pytest_tmp/
+.pytest-run/
+test-tmp/
+.codex-pip-tmp/
+.codex-pip-cache/
 
 # Translations
 *.mo
@@ -93,4 +98,4 @@ Thumbs.db
 .dual-graph/
 
 # Subtitles
-*.srt
+*.srt
@@ -0,0 +1,44 @@
+# Repository Guidelines
+
+## Project Structure & Module Organization
+Core code lives in `subs_diff/`:
+- `cli.py` and `__main__.py` provide the CLI entry points.
+- `parser.py`, `align.py`, `heuristics.py`, `segments.py`, and `llm.py` implement parsing, matching, scoring, long-segment checks, and LLM verification.
+- `report.py` and `reporter.py` generate JSON/HTML reports.
+- Shared dataclasses and config types are in `types.py` and `config.py`.
+
+Tests live in `tests/` and follow the same feature split (for example, `tests/test_align.py`, `tests/test_cli_filters.py`).
+
+## Build, Test, and Development Commands
+- `pip install -e .` installs the package in editable mode.
+- `pip install -e ".[dev]"` installs development tools (`pytest`, `ruff`, `black`, `mypy`).
+- `python -m subs_diff compare --stt A.srt --ref B.srt --out report.json` runs the main compare flow.
+- `pytest -q` runs the test suite.
+- `pytest --cov=subs_diff --cov-report=html` runs tests with coverage output in `htmlcov/`.
+- `ruff check .` runs lint checks.
+- `black .` formats code; CI runs `black --check .`, so format before pushing.
+- `mypy subs_diff` runs strict type checking.
+
+## Coding Style & Naming Conventions
+- Python 3.10+ codebase; keep compatibility with versions listed in `pyproject.toml`.
+- Use 4-space indentation and max line length `100` (Black/Ruff config).
+- Use snake_case for functions/variables/modules; PascalCase for dataclasses/types.
+- Prefer explicit type annotations; `mypy` is configured with `strict = true`.
+- Keep modules focused; add new logic to existing domain modules before creating new top-level files.
+
+## Testing Guidelines
+- Framework: `pytest` (`tests/`, files named `test_*.py`).
+- Add tests for every behavior change, especially CLI flags and alignment heuristics.
+- Name tests by behavior, e.g. `test_compare_resumes_from_checkpoint`.
+- For bug fixes, add a regression test that fails before the fix.
+
+## Commit & Pull Request Guidelines
+Current history uses short, direct subjects (for example, `long segments detection`, `Update .gitignore`). Follow that style, but make subjects specific and actionable.
+
+- Commit message format: short imperative subject, optionally with scope (e.g. `align: tighten time window filter`).
+- PRs should include: purpose, key changes, test evidence (`pytest`/lint/type-check output), and sample CLI command(s) for manual verification.
+- Link related issues and attach report artifacts/screenshots when output format changes.
+
+## Security & Configuration Tips
+- Do not commit API keys or local config files.
+- Prefer CLI/config storage for secrets (`subs-diff config set ...`) and keep generated reports/debug logs out of commits unless needed for fixtures.
@@ -40,14 +40,12 @@ dev = [
     "black>=23.0.0",
     "ruff>=0.1.0",
     "mypy>=1.5.0",
+    "types-tqdm>=4.67.0",
 ]
 
 [project.scripts]
 subs-diff = "subs_diff.cli:main"
 
-[project.entry-points."console_scripts"]
-subs-diff = "subs_diff.cli:main"
-
 [tool.setuptools.packages.find]
 where = ["."]
 include = ["subs_diff*"]
 
@@ -3,14 +3,14 @@
 __version__ = "0.1.0"
 
 from subs_diff.types import (
-    Segment,
     Candidate,
-    Issue,
-    Severity,
     Category,
+    Config,
+    Issue,
     LLMVerdict,
     Report,
-    Config,
+    Segment,
+    Severity,
 )
 
 __all__ = [
 
@@ -1,6 +1,7 @@
 """Entry point для python -m subs_diff."""
 
 import sys
+
 from subs_diff.cli import main
 
 if __name__ == "__main__":
 
@@ -1,10 +1,9 @@
 """Выравнивание сегментов и merge операций."""
 
 from dataclasses import dataclass
-from typing import Iterable
 
-from subs_diff.types import Segment, MergedSegment, Candidate, SimilarityMetrics
-from subs_diff.heuristics import compute_similarity, is_candidate, RareTokenDetector
+from subs_diff.heuristics import compute_similarity, is_candidate
+from subs_diff.types import Candidate, MergedSegment, Segment, SimilarityMetrics
 
 
 @dataclass
@@ -201,7 +200,7 @@ def align_segments(
             merged_a, merged_b, metrics = best_match
             aligned_pairs.append((merged_a, merged_b))
             if is_candidate(
-                metrics, 
+                metrics,
                 min_score=min_score,
                 a_tokens=merged_a.tokens,
                 b_tokens=merged_b.tokens,
@@ -268,7 +267,7 @@ def align_segments(
             merged_a = merge_segments([best_temporal_a])
             metrics = compute_similarity(merged_a, merged_b)
             if is_candidate(
-                metrics, 
+                metrics,
                 min_score=min_score,
                 a_tokens=merged_a.tokens,
                 b_tokens=merged_b.tokens,
@@ -290,8 +289,12 @@ def align_segments(
                 ),
                 b_segment=merged_b,
                 metrics=SimilarityMetrics(
-                    jaccard=0.0, char_3gram=0.0, levenshtein=0.0,
-                    length_ratio=0.0, rare_token_overlap=0.0, rare_token_missing=1.0,
+                    jaccard=0.0,
+                    char_3gram=0.0,
+                    levenshtein=0.0,
+                    length_ratio=0.0,
+                    rare_token_overlap=0.0,
+                    rare_token_missing=1.0,
                 ),
                 is_forced_like=True,
             )
 
@@ -0,0 +1,50 @@
+"""Checkpoint persistence for interrupted comparisons."""
+
+import logging
+from pathlib import Path
+
+from subs_diff.report import generate_report, load_report_json, save_report_json
+from subs_diff.types import Issue
+
+logger = logging.getLogger(__name__)
+
+
+def save_checkpoint(
+    issues: list[Issue],
+    out_file: str | Path | None,
+    processed: int = 0,
+    total: int = 0,
+) -> None:
+    """Save partial comparison results in the report JSON shape."""
+    if out_file is None:
+        return
+
+    try:
+        report = generate_report(
+            issues=issues,
+            stt_file="",
+            ref_file="",
+            config={"partial": True, "processed": processed, "total": total},
+        )
+        save_report_json(report, out_file)
+        logger.info("Чекпоинт сохранён: %s/%s проблем обработано", processed, total)
+    except Exception as exc:
+        logger.error("Ошибка сохранения чекпоинта: %s", exc)
+
+
+def load_resume_checkpoint(out_file: str | Path) -> tuple[list[Issue], int]:
+    """
+    Load a partial checkpoint from a report JSON file.
+
+    Returns:
+        A pair of restored issues and processed candidate count. If the file is
+        absent or is not a partial checkpoint, returns an empty checkpoint.
+    """
+    path = Path(out_file)
+    if not path.exists():
+        return [], 0
+
+    report = load_report_json(path)
+    cfg = report.metadata.config
+    processed = int(cfg.get("processed", 0)) if cfg.get("partial") else 0
+    return report.issues, processed