# app.py
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import uvicorn
import io
import pymupdf  # ← use PyMuPDF directly (no 'fitz')
from typing import List, Optional, Dict, Any
from PIL import Image
import pytesseract
import re
from fastapi.middleware.cors import CORSMiddleware
import json
from difflib import SequenceMatcher
from openai import OpenAI
import os

# ASGI application object. `root_path` tells FastAPI the URL prefix it is served
# under — presumably a reverse proxy mounts this service at /parseai; confirm
# against the deployment configuration.
app = FastAPI(title="PDF to Text (with OCR)", version="1.1.0", root_path="/parseai")

# NOTE(review): wildcard origins combined with allow_credentials=True is a very
# permissive, testing-style CORS policy. FastAPI's CORS documentation says origins,
# methods and headers should be listed explicitly when credentials are allowed —
# confirm and restrict before exposing this service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],      # every origin is accepted; replace with an explicit list for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

class PageResult(BaseModel):
    """Extraction result for a single PDF page."""

    page: int  # 1-based page number
    method: str  # "text", "ocr", or "ocr_failed"
    text: str  # text obtained for the page (may be empty, e.g. when OCR failed)


class InvoiceData(BaseModel):
    """Invoice fields, mirroring the keys of the JSON template in the GPT system prompt.

    Every field is a string that defaults to the empty string.
    """

    # NOTE(review): this model is not referenced anywhere in the visible part of
    # this file; GPT output is passed around as a plain dict instead.
    INVOICE_NUMBER: str = ""
    INVOICE_DATE: str = ""
    TERMS: str = ""
    VENDOR_NAME: str = ""
    RESERVATION_NUMBER: str = ""
    TAX_PERCENTAGE: str = ""
    TAX_AMOUNT: str = ""
    DISCOUNT_AMOUNT: str = ""
    SUBTOTAL_AMOUNT: str = ""
    INVOICE_AMOUNT: str = ""
    BILLING_ADDRESS: str = ""
    TOTAL_AMOUNT: str = ""
    TIP_AMOUNT: str = ""
    PAYMENT_METHOD: str = ""
    PAYMENT_TYPE: str = ""
    PAYMENT_ID: str = ""


class ExtractResponse(BaseModel):
    """Response payload of the /extract endpoint."""

    filename: str  # uploaded file name, or "upload.pdf" when the upload has none
    pages: List[PageResult]  # one entry per PDF page, in document order
    text: str  # all page texts joined with a blank line between pages
    # Dict returned by GPT parsing; None when parsing was not requested. When the
    # GPT call fails it holds "error" and "status" keys instead of invoice fields.
    gpt_parsed_data: Optional[Dict[str, Any]] = None
    # Score in the range 0-100 from calculate_match_percentage; None when GPT
    # parsing was not requested, 0.0 when the GPT call failed.
    match_percentage: Optional[float] = None


def _mostly_whitespace(s: str) -> bool:
    if not s:
        return True
    non_ws = len(re.sub(r"\s+", "", s))
    return non_ws < max(40, int(0.10 * len(s)))


def _page_to_image(page: "pymupdf.Page", dpi: int = 300) -> Image.Image:
    """Rasterize *page* at *dpi* and return it as a PIL image, without external tools.

    The page is rendered to an opaque (no alpha channel) pixmap, encoded as PNG
    in memory, and reopened with Pillow.
    """
    # The matrix scales the page by dpi / 72 on both axes.
    zoom = dpi / 72
    pixmap = page.get_pixmap(matrix=pymupdf.Matrix(zoom, zoom), alpha=False)
    png_buffer = io.BytesIO(pixmap.tobytes("png"))
    return Image.open(png_buffer)


def _ocr_image(img: Image.Image) -> str:
    """Run Tesseract OCR over *img* using the English language data and return the text."""
    # If the tesseract binary is not found (e.g. macOS/Homebrew), it can be set explicitly:
    # pytesseract.pytesseract.tesseract_cmd = "/opt/homebrew/bin/tesseract"
    recognized_text = pytesseract.image_to_string(img, lang="eng")
    return recognized_text


# Separators ignored when comparing values such as dates ("01-02-2024" vs "01/02/2024").
_MATCH_SEPARATORS_RE = re.compile(r"[-/\s]+")


def _has_similar_token(value: str, tokens, threshold: float = 0.6) -> bool:
    """Return True if any string in *tokens* is more than *threshold* similar to *value*.

    Similarity is ``difflib.SequenceMatcher.ratio()``. *tokens* is any iterable
    of strings.
    """
    matcher = SequenceMatcher(None)
    # SequenceMatcher caches its analysis of the second sequence, so the fixed
    # value goes there and the varying token is set as the first sequence.
    matcher.set_seq2(value)
    for token in tokens:
        matcher.set_seq1(token)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on ratio();
        # testing them first skips the expensive comparison for most tokens.
        if (
            matcher.real_quick_ratio() > threshold
            and matcher.quick_ratio() > threshold
            and matcher.ratio() > threshold
        ):
            return True
    return False


def calculate_match_percentage(extracted_text: str, gpt_data: Dict[str, Any]) -> float:
    """
    Score how well the GPT-parsed values are supported by the extracted text.

    Every non-empty field (the "error" key is ignored) is compared
    case-insensitively against the text and contributes:

    * 1.0 if the value occurs verbatim, or occurs once hyphens, slashes and
      whitespace are removed from both sides;
    * 0.8 if it is a multi-word value and at least 60% of its words (only
      words longer than two characters are counted) occur in the text;
    * 0.5 if it is a single-word value that is more than 60% similar to some
      whitespace-delimited token of the text;
    * 0.0 otherwise.

    Args:
        extracted_text: Text extracted from the document.
        gpt_data: Mapping of field name to parsed value; values are
            stringified before comparison.

    Returns:
        Average field score as a percentage (0-100) rounded to two decimals,
        or 0.0 when either argument is empty or no field has a value.
    """
    if not gpt_data or not extracted_text:
        return 0.0

    # Normalise the text once; these do not depend on the field being checked.
    text_lower = extracted_text.lower()
    text_compact = _MATCH_SEPARATORS_RE.sub("", text_lower)
    text_tokens = None  # built lazily, only if a fuzzy comparison is needed

    matches = 0.0
    total_fields = 0

    for key, value in gpt_data.items():
        if key == "error":  # Skip error field
            continue

        value_str = str(value).lower().strip() if value else ""
        if not value_str:  # Only check non-empty values
            continue
        total_fields += 1

        value_compact = _MATCH_SEPARATORS_RE.sub("", value_str)
        # An empty compact form (value made only of separators, e.g. "-") is a
        # substring of everything, so it must not count as a match by itself.
        if value_str in text_lower or (value_compact and value_compact in text_compact):
            matches += 1
            continue

        words = value_str.split()
        if len(words) > 1:
            # Multi-word value: accept when most of its words are present.
            word_matches = sum(1 for word in words if len(word) > 2 and word in text_lower)
            if word_matches >= len(words) * 0.6:  # 60% of words match
                matches += 0.8
        else:
            # Single-word value: compare against individual tokens of the text.
            # Comparing against the whole document can practically never reach
            # the threshold, because ratio() is diluted by the document length.
            if text_tokens is None:
                text_tokens = set(text_lower.split())
            if _has_similar_token(value_str, text_tokens, 0.6):
                matches += 0.5  # Partial match

    if total_fields == 0:
        return 0.0
    return round(matches / total_fields * 100, 2)


def _parse_gpt_json_content(content: str) -> Dict[str, Any]:
    """Decode the model's reply into a dict, tolerating a surrounding Markdown code fence.

    Raises:
        Exception: If the reply is not valid JSON or is not a JSON object.
    """
    cleaned = content.strip()

    # Remove markdown code blocks if present
    if cleaned.startswith("```json"):
        cleaned = cleaned[7:]
    elif cleaned.startswith("```"):
        cleaned = cleaned[3:]

    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]

    cleaned = cleaned.strip()

    try:
        parsed_data = json.loads(cleaned)
    except json.JSONDecodeError as e:
        print(f"JSON Parse Error: {str(e)}")
        print(f"Content received: {cleaned}")
        raise Exception(f"Invalid JSON from GPT: {str(e)}") from e

    # Validate that it's a dictionary
    if not isinstance(parsed_data, dict):
        raise Exception("Response is not a JSON object")

    return parsed_data


def call_gpt_for_invoice_parsing(text: str, api_key: str) -> Dict[str, Any]:
    """
    Ask the OpenAI chat completions API to extract invoice fields from *text*.

    Args:
        text: Plain text of the invoice.
        api_key: OpenAI API key used to build the client.

    Returns:
        The JSON object returned by the model, decoded into a dict.

    Raises:
        Exception: On any failure (API error, empty reply, invalid JSON,
            non-object JSON). The message is prefixed with
            "Failed to parse invoice with GPT:" and the original error is
            chained as ``__cause__``.
    """
    try:
        # Initialize OpenAI client
        client = OpenAI(api_key=api_key)

        # Make the API call
        response = client.chat.completions.create(
            # NOTE(review): any replacement model must support JSON mode
            # (response_format below); not every model does — verify against
            # the OpenAI documentation before switching.
            model="gpt-4.1",
            messages=[
                {
                    "role": "system",
                    "content": """You are an expert in document understanding. Extract structured data from invoice text and date format should be MM-DD-YYYY and remove currency symbol and return it in the following JSON format:
{
  "INVOICE_NUMBER": "",
  "INVOICE_DATE": "",
  "TERMS": "",
  "VENDOR_NAME": "",
  "RESERVATION_NUMBER": "",
  "TAX_PERCENTAGE": "",
  "TAX_AMOUNT": "",
  "DISCOUNT_AMOUNT": "",
  "SUBTOTAL_AMOUNT": "",
  "INVOICE_AMOUNT": "",
  "BILLING_ADDRESS": "",
  "TOTAL_AMOUNT": "",
  "TIP_AMOUNT": "",
  "PAYMENT_METHOD": "", #CAN ONLY BE CARD OR DIGITAL
  "PAYMENT_TYPE": "", #PICK the payment type acooding to the payment method
  "PAYMENT_ID": "" 
}

IMPORTANT: Return ONLY valid JSON. Do not include any markdown formatting, code blocks, or explanatory text. Just the raw JSON object."""
                },
                {
                    "role": "user",
                    "content": f"Extract invoice data from this text:\n\n{text}"
                }
            ],
            max_tokens=1500,
            temperature=0.3,  # Lower temperature for more consistent output
            response_format={"type": "json_object"}  # Force JSON response
        )

        # Get the response content
        content = response.choices[0].message.content

        if not content:
            raise Exception("Empty response from GPT")

        return _parse_gpt_json_content(content)

    except Exception as e:
        # Single wrapping point: callers only need to catch Exception, while the
        # chained cause keeps the original traceback available for debugging.
        print(f"GPT API Error: {str(e)}")
        raise Exception(f"Failed to parse invoice with GPT: {str(e)}") from e


@app.get("/health")
def health():
    """Liveness probe: report that the service is running."""
    payload = {"status": "ok"}
    return payload


# Accepted range for the caller-supplied OCR resolution. The upper bound keeps an
# untrusted request from forcing very large page bitmaps to be allocated.
_MIN_OCR_DPI = 72
_MAX_OCR_DPI = 600


@app.post("/extract", response_model=ExtractResponse)
async def extract(
    file: UploadFile = File(..., description="PDF to extract"),
    password: Optional[str] = Form(None, description="Password if the PDF is encrypted"),
    force_ocr: bool = Form(False, description="If true, OCR every page regardless"),
    ocr_dpi: int = Form(300, description="DPI when rasterizing for OCR (300–400 recommended)"),
    enable_gpt_parsing: bool = Form(False, description="Enable GPT-4 invoice parsing"),
):
    """
    Extract text from a PDF using PyMuPDF's native text first; fall back to Tesseract OCR when needed.
    OCR language is fixed to English (eng).
    Optionally parse invoice data using GPT-4 and calculate match percentage.

    The OpenAI API key is read from the OPENAI_API_KEY environment variable.

    Responses:
        200: ExtractResponse payload. A GPT failure does not fail the request; it
            is reported inside ``gpt_parsed_data`` with ``match_percentage`` 0.0.
        400: Empty upload, ``ocr_dpi`` outside the accepted range, or GPT parsing
            requested while no API key is configured.
        401: The PDF is encrypted and no valid password was supplied.
        500: Any other failure while processing the PDF.
    """
    try:
        # Reject resolutions that cannot work (<= 0) or would allocate huge bitmaps.
        if not _MIN_OCR_DPI <= ocr_dpi <= _MAX_OCR_DPI:
            raise HTTPException(
                status_code=400,
                detail=f"ocr_dpi must be between {_MIN_OCR_DPI} and {_MAX_OCR_DPI}",
            )

        # Secrets must never live in source control; take the key from the environment.
        openai_api_key = os.environ.get("OPENAI_API_KEY", "")
        # Fail fast, before any expensive extraction/OCR work is done.
        if enable_gpt_parsing and not openai_api_key:
            raise HTTPException(status_code=400, detail="OpenAI API key required for GPT parsing")

        data = await file.read()
        if not data:
            raise HTTPException(status_code=400, detail="Empty file")

        pages: List[PageResult] = []
        all_text: List[str] = []

        # NOTE(review): text extraction, OCR and the GPT request below are
        # synchronous calls inside a coroutine, so they block the event loop
        # while they run — consider offloading them if concurrency matters.

        # Open from bytes; the context manager closes the document on every exit path.
        with pymupdf.open(stream=data, filetype="pdf") as doc:
            # Handle encryption
            if doc.needs_pass and (not password or not doc.authenticate(password)):
                raise HTTPException(status_code=401, detail="PDF is encrypted; valid password required")

            for page_number, page in enumerate(doc, start=1):
                method = "text"
                page_text = page.get_text("text") if not force_ocr else ""

                if force_ocr or not page_text or _mostly_whitespace(page_text):
                    try:
                        img = _page_to_image(page, dpi=ocr_dpi)
                        page_text = _ocr_image(img)
                        method = "ocr"
                    except Exception as ocr_error:
                        # Best effort: keep whatever native text exists, but record
                        # the failure instead of swallowing it silently.
                        print(f"OCR failed on page {page_number}: {ocr_error}")
                        method = "ocr_failed"
                        page_text = page_text or ""

                pages.append(PageResult(page=page_number, method=method, text=page_text))
                all_text.append(page_text)

        full_text = "\n\n".join(all_text)

        # GPT Parsing (if enabled)
        gpt_parsed_data = None
        match_percentage = None

        if enable_gpt_parsing:
            try:
                print(f"Calling GPT with text length: {len(full_text)}")
                gpt_parsed_data = call_gpt_for_invoice_parsing(full_text, openai_api_key)
                print(f"GPT response: {gpt_parsed_data}")

                if gpt_parsed_data and "error" not in gpt_parsed_data:
                    match_percentage = calculate_match_percentage(full_text, gpt_parsed_data)
                    print(f"Match percentage: {match_percentage}")

            except Exception as e:
                # Log the error but don't fail the entire request
                error_msg = str(e)
                print(f"GPT parsing failed: {error_msg}")
                gpt_parsed_data = {
                    "error": error_msg,
                    "status": "failed"
                }
                match_percentage = 0.0

        response_data = ExtractResponse(
            filename=file.filename or "upload.pdf",
            pages=pages,
            text=full_text,
            gpt_parsed_data=gpt_parsed_data,
            match_percentage=match_percentage
        )

        return JSONResponse(
            status_code=200,
            content=response_data.model_dump(),
        )

    except HTTPException:
        # Deliberate HTTP errors raised above pass through unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to process PDF: {e}") from e


if __name__ == "__main__":
    # Direct-run entry point: serve the `app` object from module `app` on all
    # interfaces, port 8600, with auto-reload enabled.
    # NOTE(review): reload=True and host 0.0.0.0 look like development settings —
    # confirm how the service is launched in production.
    uvicorn.run("app:app", host="0.0.0.0", port=8600, reload=True)