上传前人工裁剪错题区域，OCR 原文排除手写作答。

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-28 16:01:46 +08:00
parent 23be608521
commit acfe002fbf
18 changed files with 975 additions and 448 deletions
@@ -107,3 +107,51 @@ def split_solution_sections(text: str) -> tuple[str | None, str]:
    approach = parts[0].replace("## 解题思路", "").strip()
    rest = "## " + parts[1]
    return approach or None, rest.strip()
+
+
+def union_bbox(bboxes: list[list[float]], img_w: int, img_h: int, padding_ratio: float = 0.06) -> list[int]:
+    if not bboxes:
+        return [0, 0, img_w, img_h]
+    x1 = min(b[0] for b in bboxes)
+    y1 = min(b[1] for b in bboxes)
+    x2 = max(b[2] for b in bboxes)
+    y2 = max(b[3] for b in bboxes)
+    pad_x = max(8, (x2 - x1) * padding_ratio)
+    pad_y = max(8, (y2 - y1) * padding_ratio)
+    return [
+        int(max(0, x1 - pad_x)),
+        int(max(0, y1 - pad_y)),
+        int(min(img_w, x2 + pad_x)),
+        int(min(img_h, y2 + pad_y)),
+    ]
+
+
+def cropped_rel_path(original_rel: str) -> str:
+    p = Path(original_rel)
+    return str(p.parent / f"{p.stem}_crop.jpg")
+
+
+def crop_wrong_region(
+    src_path: str,
+    lines: list[dict],
+    wrong_ids: list[int],
+    dest_rel_path: str,
+    img_width: int,
+    img_height: int,
+) -> str | None:
+    if not wrong_ids:
+        return None
+    bboxes = [lines[i].get("bbox") or [0, 0, 0, 0] for i in wrong_ids if i < len(lines)]
+    if not bboxes:
+        return None
+    box = union_bbox(bboxes, img_width, img_height, padding_ratio=0.12)
+    x1, y1, x2, y2 = box
+    if x2 <= x1 or y2 <= y1:
+        return None
+
+    img = Image.open(src_path).convert("RGB")
+    cropped = img.crop((x1, y1, x2, y2))
+    full_path = Path(settings.UPLOAD_DIR) / dest_rel_path
+    full_path.parent.mkdir(parents=True, exist_ok=True)
+    cropped.save(full_path, format="JPEG", quality=92)
+    return dest_rel_path
@@ -77,6 +77,8 @@ def run_migrations() -> None:
            wq_alters.append("ADD COLUMN mark_regions_json TEXT")
        if "annotated_image_path" not in wq_columns:
            wq_alters.append("ADD COLUMN annotated_image_path VARCHAR(512)")
+        if "cropped_image_path" not in wq_columns:
+            wq_alters.append("ADD COLUMN cropped_image_path VARCHAR(512)")
        if "error_message" not in wq_columns:
            wq_alters.append("ADD COLUMN error_message TEXT")
        if wq_alters:
@@ -0,0 +1,103 @@
+"""OCR 行分类：区分印刷题干与手写作答。"""
+import re
+
+# Typical markers of printed question text. The alternatives cover:
+#   - "第N题" question headers (Arabic or Chinese numerals);
+#   - numerals wrapped in half- or full-width parentheses;
+#   - a 1-2 digit item number at the start of a line followed by ". ． 、 )";
+#   - an option letter A-D (either case) at the start of the text followed
+#     by ". ． 、";
+#   - question-type names and common stem keywords, matched anywhere.
+# re.MULTILINE lets the "^" alternatives anchor at the start of any line.
+_PRINTED_RE = re.compile(
+    r"(第\s*[0-9一二三四五六七八九十百]+题|"
+    r"[（(]\s*[0-9一二三四五六七八九十]+\s*[）)]|"
+    r"^\s*[0-9]{1,2}\s*[\.．、\)]|"
+    r"^[A-Da-d]\s*[\.．、]|"
+    r"选择题|填空题|解答题|证明题|计算题|应用题|"
+    r"下列|以下|正确|错误|不正确|单选|多选|"
+    r"已知|求证|设|若|求|如图|如图所示)",
+    re.MULTILINE,
+)
+
+# Typical shapes of handwritten answers (bare arithmetic, short fragments):
+#   1. the whole string consists only of digits, whitespace, arithmetic and
+#      comparison operators, brackets and punctuation;
+#   2. the string starts with x/y/z (either case) followed by "=" or "＝";
+#   3. the whole string is digits, then "." or "．", then optional digits.
+_HANDWRITE_RE = re.compile(
+    r"^[0-9\s+\-×÷*/=≈<>()\[\].,，、%°]+$|"
+    r"^[xXyYzZ]\s*[=＝]|"
+    r"^\s*\d+\s*[\.．]\s*\d*\s*$"
+)
+
+
+def _line_center_y(line: dict) -> float:
+    bbox = line.get("bbox") or [0, 0, 0, 0]
+    return (float(bbox[1]) + float(bbox[3])) / 2.0
+
+
+def _looks_printed(text: str) -> bool:
+    t = text.strip()
+    if len(t) >= 12 and _PRINTED_RE.search(t):
+        return True
+    if _PRINTED_RE.match(t):
+        return True
+    return False
+
+
+def _looks_handwritten(text: str, confidence: float) -> bool:
+    t = text.strip()
+    if not t:
+        return True
+    if _looks_printed(t):
+        return False
+    if _HANDWRITE_RE.match(t):
+        return True
+    if len(t) <= 6 and confidence < 0.92:
+        return True
+    digit_ratio = sum(c.isdigit() or c in "+-×÷*/=≈.%" for c in t) / max(len(t), 1)
+    if digit_ratio > 0.55 and len(t) < 20:
+        return True
+    return False
+
+
+def split_printed_handwriting(
+    lines: list[dict],
+    img_height: int,
+    *,
+    answer_zone_ratio: float = 0.45,
+    enabled: bool = True,
+) -> tuple[list[int], list[int]]:
+    """
+    返回 (印刷题干行编号, 手写作答行编号)，编号为 lines 列表下标。
+    answer_zone_ratio: 图片高度比例，低于此 y 中心视为题干区，高于视为作答区。
+    """
+    if not lines or not enabled or img_height <= 0:
+        return list(range(len(lines))), []
+
+    split_y = img_height * answer_zone_ratio
+    printed_ids: list[int] = []
+    handwriting_ids: list[int] = []
+
+    for i, line in enumerate(lines):
+        text = line.get("text", "")
+        conf = float(line.get("confidence") or 0.0)
+        cy = _line_center_y(line)
+
+        if _looks_printed(text):
+            printed_ids.append(i)
+            continue
+
+        in_answer_zone = cy >= split_y
+        if in_answer_zone and _looks_handwritten(text, conf):
+            handwriting_ids.append(i)
+        elif not in_answer_zone:
+            printed_ids.append(i)
+        elif in_answer_zone:
+            handwriting_ids.append(i)
+
+    if not printed_ids and lines:
+        printed_ids = list(range(min(3, len(lines))))
+
+    if not handwriting_ids and len(lines) >= 2:
+        handwriting_ids = list(range(max(0, len(lines) - 3), len(lines)))
+
+    return printed_ids, handwriting_ids
+
+
+def lines_by_indices(lines: list[dict], indices: list[int]) -> list[dict]:
+    return [lines[i] for i in indices if 0 <= i < len(lines)]
+
+
+def text_from_indices(lines: list[dict], indices: list[int]) -> str:
+    return "\n".join(lines[i].get("text", "") for i in indices if 0 <= i < len(lines)).strip()