import os
import re
import fitz  # PyMuPDF
import layoutparser as lp
from PIL import Image
from collections import defaultdict
import html as html_utils
import io
import os

# Point both cache-related environment variables at the application's cache
# directory (presumably so model downloads land there -- confirm with deploy).
os.environ.update(
    {
        "TORCH_HOME": "/var/www/teggl/fontify/cache",
        "XDG_CACHE_HOME": "/var/www/teggl/fontify/cache",
    }
)


def is_likely_page_number(text, bbox, page_height):
    """Guess whether a text block is a bare page number.

    A block qualifies when its stripped text consists solely of digits and
    its top edge (``bbox[1]``) lies in the top or bottom 10% of the page.

    Args:
        text: Raw text of the block.
        bbox: Sequence whose second item is the block's top y-coordinate.
        page_height: Page height, in the same units as ``bbox``.

    Returns:
        True if the block looks like a page number, otherwise False.
    """
    if not text.strip().isdigit():
        return False
    top = bbox[1]
    in_margin = top < page_height * 0.1 or top > page_height * 0.9
    return in_margin


def is_likely_footer_header(text, bbox, page_height):
    """Guess whether a text block is running header/footer boilerplate.

    The block must be non-blank, have its top edge (``bbox[1]``) in the top
    or bottom 10% of the page, and match one of a fixed list of
    case-insensitive journal/publisher patterns.

    Args:
        text: Raw text of the block.
        bbox: Sequence whose second item is the block's top y-coordinate.
        page_height: Page height, in the same units as ``bbox``.

    Returns:
        True if the block looks like a header or footer, otherwise False.
    """
    content = text.strip()
    if not content:
        return False

    top = bbox[1]
    if not (top > page_height * 0.9 or top < page_height * 0.1):
        return False

    boilerplate_patterns = (
        r"\bVolume\b.*\bIssue\b",
        r"\b(19|20)\d{2}\b.*\b\d{1,4}–\d{1,4}\b",
        r"Inf Syst Front",
        r"JETOL",
        r"Springer",
        r"Elsevier",
        r"\bReferences\b",
    )
    return any(
        re.search(pattern, content, re.IGNORECASE)
        for pattern in boilerplate_patterns
    )


def merge_adjacent_blocks(blocks):
    """Fuse same-type layout blocks whose boxes intersect an anchor block.

    Blocks are visited in order. Each block not yet absorbed becomes an
    anchor; every later, unabsorbed block of the same ``type`` whose
    rectangle intersects the anchor's own rectangle joins its cluster.
    Intersection is always tested against the anchor's original box, not
    against the growing union of the cluster.

    Args:
        blocks: Sliceable sequence of layout blocks exposing ``coordinates``
            (four values) and ``type``.

    Returns:
        A list in anchor order. Single-member clusters contribute the
        original block; larger clusters contribute a new ``lp.TextBlock``
        covering the cluster's bounding box, with ``text=None`` and the
        anchor's ``type``, ``score`` and ``id``.
    """

    def to_rect(block):
        left, top, right, bottom = block.coordinates
        return fitz.Rect(left, top, right, bottom)

    result = []
    absorbed = set()
    for anchor_idx, anchor in enumerate(blocks):
        if anchor_idx in absorbed:
            continue

        anchor_rect = to_rect(anchor)
        cluster = [anchor]
        later_blocks = enumerate(blocks[anchor_idx + 1 :], start=anchor_idx + 1)
        for other_idx, other in later_blocks:
            if other_idx in absorbed or other.type != anchor.type:
                continue
            if anchor_rect.intersects(to_rect(other)):
                cluster.append(other)
                absorbed.add(other_idx)

        if len(cluster) == 1:
            result.append(anchor)
            continue

        # Bounding box of every member of the cluster.
        lefts, tops, rights, bottoms = zip(*(m.coordinates for m in cluster))
        result.append(
            lp.TextBlock(
                lp.Rectangle(min(lefts), min(tops), max(rights), max(bottoms)),
                text=None,
                type=anchor.type,
                score=getattr(anchor, "score", None),
                id=getattr(anchor, "id", None),
            )
        )
    return result


def _fraction_inside(inner, outer):
    """Return the share (0.0-1.0) of ``inner``'s area lying within ``outer``.

    Both arguments are ``[x0, y0, x1, y1]`` boxes in the same units. A
    degenerate ``inner`` box (zero or negative area) yields 0.0.
    """
    ix0, iy0, ix1, iy1 = inner
    ox0, oy0, ox1, oy1 = outer
    inner_area = (ix1 - ix0) * (iy1 - iy0)
    if inner_area <= 0:
        return 0.0
    overlap_w = min(ix1, ox1) - max(ix0, ox0)
    overlap_h = min(iy1, oy1) - max(iy0, oy0)
    if overlap_w <= 0 or overlap_h <= 0:
        return 0.0
    return (overlap_w * overlap_h) / inner_area


def extract_blocks_with_ai_visuals(pdf_path, dpi=200):
    """Extract text blocks and cropped table/figure images from a PDF.

    Each page is rendered to an image, run through the PubLayNet layout
    model, and every detected "Table" or "Figure" region is cropped and
    saved as a PNG under ``static/uploads`` next to this module. Text is
    then read with ``page.get_text("blocks")``; blank blocks, likely page
    numbers, likely headers/footers, and blocks lying mostly inside a
    detected visual region are skipped.

    Args:
        pdf_path: Path of the PDF to open.
        dpi: Resolution used to render pages for layout detection.

    Returns:
        A list of dicts in page order. Image entries carry ``type``
        ("image"), ``page``, ``src`` (file name), ``bbox`` (integer pixel
        coordinates of the rendered page image) and ``label``. Text entries
        carry ``id``, ``page``, ``bbox`` (coordinates as reported by
        ``get_text``) and ``text``.
    """
    upload_dir = os.path.join(os.path.dirname(__file__), "static", "uploads")
    os.makedirs(upload_dir, exist_ok=True)

    visual_types = ("Table", "Figure")
    # Layout boxes are in pixels of the rendered image, while get_text()
    # reports PDF points (72 per inch); scale before comparing the two.
    px_to_pt = 72.0 / dpi
    # A text block is treated as part of a visual when at least this share
    # of its area falls inside the visual's region.
    min_overlap = 0.5

    all_blocks = []
    doc = fitz.open(pdf_path)
    try:
        model = lp.AutoLayoutModel("lp://efficientdet/PubLayNet")

        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=dpi)
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            layout = merge_adjacent_blocks(model.detect(img))

            # Painting over a region does not remove its text from
            # get_text(), so visual regions are recorded (in points) and
            # used to filter the text blocks below instead.
            visual_regions = []
            for block_id, block in enumerate(layout):
                if block.type not in visual_types:
                    continue
                x1, y1, x2, y2 = map(int, block.coordinates)
                bbox = [x1, y1, x2, y2]

                cropped = img.crop((x1, y1, x2, y2))
                img_filename = f"ai_{page_number}_{block_id}_{block.type.lower()}.png"
                cropped.save(os.path.join(upload_dir, img_filename))

                all_blocks.append(
                    {
                        "type": "image",
                        "page": page_number,
                        "src": img_filename,
                        "bbox": bbox,
                        "label": block.type,
                    }
                )
                visual_regions.append([coord * px_to_pt for coord in bbox])

            page_height = page.rect.height
            for block_id, block in enumerate(page.get_text("blocks")):
                x0, y0, x1, y1, text, *_ = block
                if not text.strip():
                    continue
                text_bbox = [x0, y0, x1, y1]
                if is_likely_page_number(text, text_bbox, page_height):
                    continue
                if is_likely_footer_header(text, text_bbox, page_height):
                    continue
                if any(
                    _fraction_inside(text_bbox, region) >= min_overlap
                    for region in visual_regions
                ):
                    # Already represented by the cropped table/figure image.
                    continue
                all_blocks.append(
                    {
                        "id": f"{page_number}_{block_id}",
                        "page": page_number,
                        "bbox": text_bbox,
                        "text": text.strip(),
                    }
                )
    finally:
        # Release the file handle even if rendering or detection fails.
        doc.close()

    return all_blocks


def determine_column_aware_reading_order(blocks, column_threshold=100):
    """Order blocks page by page, reading each column top to bottom.

    On every page, blocks without a ``"type"`` are treated as text and
    grouped into columns: a block joins the first column whose first member
    has a left edge within ``column_threshold`` of its own, otherwise it
    starts a new column. Columns are emitted in order of creation (text is
    scanned by ascending left edge), each sorted by top edge. Blocks whose
    type is ``"image"`` follow all text of their page; blocks with any other
    type are dropped.

    Args:
        blocks: Iterable of dicts with a ``"page"`` key; text blocks also
            need ``"bbox"`` as ``[x0, y0, ...]``.
        column_threshold: Maximum left-edge distance for two blocks to share
            a column.

    Returns:
        A new list of the same block dicts in reading order.
    """
    by_page = defaultdict(list)
    for item in blocks:
        by_page[item["page"]].append(item)

    result = []
    for page_key in sorted(by_page):
        texts = []
        images = []
        for item in by_page[page_key]:
            kind = item.get("type")
            if kind is None:
                texts.append(item)
            elif kind == "image":
                images.append(item)

        columns = []
        for item in sorted(texts, key=lambda b: b["bbox"][0]):
            left = item["bbox"][0]
            home = next(
                (
                    col
                    for col in columns
                    if abs(left - col[0]["bbox"][0]) < column_threshold
                ),
                None,
            )
            if home is None:
                columns.append([item])
            else:
                home.append(item)

        for col in columns:
            result.extend(sorted(col, key=lambda b: b["bbox"][1]))
        result.extend(images)

    return result


def is_main_title(text, bbox, page):
    """Guess whether a text block is the document's main title.

    A block qualifies when it is on page 1, its stripped text is longer than
    60 characters, and its top edge (``bbox[1]``) is above y = 150.

    Args:
        text: Raw text of the block.
        bbox: Sequence whose second item is the block's top y-coordinate.
            May be ``None`` or shorter than two items, in which case the
            block is never a title.
        page: Page number of the block.

    Returns:
        True if the block looks like the main title, otherwise False.
    """
    if page != 1 or len(text.strip()) <= 60:
        return False
    # Callers may pass an empty bbox for blocks without position data;
    # treat that as "not a title" instead of raising IndexError.
    if bbox is None or len(bbox) < 2:
        return False
    return bbox[1] < 150


def is_section_heading(text):
    """Match a numbered top-level heading such as ``"2 Methods"``.

    Returns:
        The ``re.Match`` object (truthy) when the stripped text starts with
        digits, whitespace, and an uppercase ASCII letter; otherwise None.
    """
    candidate = text.strip()
    section_pattern = r"^\d+\s+[A-Z]"
    return re.match(section_pattern, candidate)


def is_subsection_heading(text):
    """Match a numbered sub-heading such as ``"2.1 Data"`` or ``"2.1.3 Data"``.

    Returns:
        The ``re.Match`` object (truthy) when the stripped text starts with a
        two- or three-level dotted number, whitespace, and an uppercase
        ASCII letter; otherwise None.
    """
    candidate = text.strip()
    subsection_pattern = r"^\d+\.\d+(\.\d+)?\s+[A-Z]"
    return re.match(subsection_pattern, candidate)


def is_subheading(text):
    """Return True when the stripped text starts with ``N.N`` plus whitespace.

    Unlike the other heading checks, the character after the whitespace is
    not constrained, so ``"1.2 lowercase"`` also qualifies.
    """
    candidate = text.strip()
    return re.match(r"^\d+\.\d+\s", candidate) is not None


def is_heading(text, bbox=None):
    """Heuristically decide whether a text block is a heading.

    Rejected outright: text longer than 100 characters, text containing a
    DOI/URL/journal/year-in-parentheses marker, text that is only a number,
    number range or fraction, and (when ``bbox`` is given and non-empty)
    blocks whose top edge ``bbox[1]`` is below y = 250. Of what remains,
    numbered headings, all-uppercase text and title-cased text are accepted.

    Args:
        text: Raw text of the block.
        bbox: Optional sequence whose second item is the top y-coordinate.

    Returns:
        True if the block looks like a heading, otherwise False.
    """
    candidate = text.strip()
    if len(candidate) > 100:
        return False

    citation_like = re.search(
        r"(doi|https?://|Inf Syst Front|\(\d{4}\)|[A-Z]{2,} \(\d{4}\))",
        candidate,
        re.IGNORECASE,
    )
    bare_number = re.fullmatch(r"\d+|\d+–\d+|\d+/\d+", candidate)
    if citation_like or bare_number:
        return False

    if bbox and bbox[1] > 250:
        return False

    numbered = re.match(r"^\d+(\.\d+)*\s+[A-Z]", candidate)
    return bool(numbered) or candidate.isupper() or candidate.istitle()


def generate_html_from_blocks(blocks, styles):
    """Render ordered blocks as a standalone, restyled HTML document.

    Image blocks become ``<img>`` tags pointing at the uploads URL; their
    ``src`` and ``alt`` values are HTML-escaped so quotes or angle brackets
    cannot break the markup. Text blocks have newlines replaced by spaces,
    are HTML-escaped, and are wrapped in ``<h1>``, ``<h2>``, ``<h3>`` or
    ``<p>`` according to the first heading check that matches. Only the
    first block that qualifies as the main title becomes ``<h1>``.

    Args:
        blocks: Iterable of block dicts. Image blocks need ``"type"`` equal
            to ``"image"`` and ``"src"``; other blocks need ``"text"`` and
            may carry ``"bbox"`` and ``"page"``.
        styles: Mapping with ``"font"``, ``"size"``, ``"line_height"`` and
            ``"spacing"``. These values are interpolated verbatim into the
            stylesheet, so callers must validate them.

    Returns:
        The complete HTML document as a string.

    Raises:
        KeyError: If a required key is missing from ``styles`` or a block.
    """
    font = styles["font"]
    size = styles["size"]
    line_height = styles["line_height"]
    spacing = styles["spacing"]

    font_face_block = f"""
    @font-face {{
        font-family: {font};
        src: url("https://fontify.teggl.com/static/fonts/{font}.ttf") format("truetype");
    }}
    """
    document_head = f"""<!DOCTYPE html>
    <html>
    <head>
        <meta charset='utf-8'>
        <style>
            {font_face_block}
            body {{
                font-family: {font};
                font-size: {size}px;
                line-height: {line_height};
                word-spacing: {spacing}px;
                text-align: left;
                margin: 2em;
            }}
            table {{
                border-collapse: collapse;
                margin: 1em 0;
                width: 100%;
            }}
            table td, table th {{
                border: 1px solid #aaa;
                padding: 5px;
            }}
            img {{
                max-width: 100%;
                margin: 1em 0;
            }}
        </style>
    </head>
    <body>
    """

    # Collect fragments and join once instead of growing a string in a loop.
    parts = [document_head]
    title_set = False
    for block in blocks:
        if block.get("type") == "image":
            # Escape attribute values the same way body text is escaped.
            src = html_utils.escape(str(block["src"]))
            alt = html_utils.escape(str(block.get("label", "Visual Block")))
            parts.append(
                f'<img src="https://fontify.teggl.com/static/uploads/{src}" alt="{alt}">'
            )
            continue

        raw_text = block["text"]
        clean_text = html_utils.escape(raw_text.replace("\n", " "))
        bbox = block.get("bbox", [])
        page = block.get("page", 1)

        # Order matters: the first matching rule decides the tag.
        if not title_set and is_main_title(raw_text, bbox, page):
            parts.append(f"<h1>{clean_text}</h1>")
            title_set = True
        elif is_section_heading(raw_text):
            parts.append(f"<h2>{clean_text}</h2>")
        elif is_subsection_heading(raw_text):
            parts.append(f"<h3>{clean_text}</h3>")
        elif is_subheading(raw_text):
            parts.append(f"<h3>{clean_text}</h3>")
        elif is_heading(raw_text, bbox):
            parts.append(f"<h2>{clean_text}</h2><hr>")
        else:
            parts.append(f"<p>{clean_text}</p>")

    parts.append("</body></html>")
    return "".join(parts)