mirror of
https://github.com/hoshikawa2/rfp_response_automation.git
synced 2026-03-06 02:10:41 +00:00
3096 lines
92 KiB
Python
3096 lines
92 KiB
Python
from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
|
||
from langchain_core.prompts import PromptTemplate
|
||
from langchain.schema.output_parser import StrOutputParser
|
||
from langchain_community.embeddings import OCIGenAIEmbeddings
|
||
from langchain_community.vectorstores import FAISS
|
||
from langchain.schema.runnable import RunnableMap
|
||
from langchain_community.document_loaders import UnstructuredPDFLoader, PyMuPDFLoader
|
||
from langchain_core.documents import Document
|
||
from langchain_core.runnables import RunnableLambda
|
||
from pathlib import Path
|
||
from tqdm import tqdm
|
||
import os
|
||
import pickle
|
||
import re
|
||
import atexit
|
||
import oracledb
|
||
import json
|
||
import base64
|
||
import hashlib
|
||
from datetime import datetime
|
||
import requests
|
||
import textwrap
|
||
import unicodedata
|
||
from typing import Optional
|
||
from collections import deque
|
||
from langchain.callbacks.base import BaseCallbackHandler
|
||
from langdetect import detect
|
||
from config_loader import load_config
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
import threading
|
||
|
||
def chunk_hash(text: str) -> str:
    """Return the SHA-256 hex digest of *text* (UTF-8 encoded)."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
|
||
|
||
# Load application configuration (paths, credentials, model ids) from disk.
config = load_config()

# =========================
# Oracle Autonomous Configuration
# =========================
WALLET_PATH = config.wallet_path
DB_ALIAS = config.db_alias
USERNAME = config.username
PASSWORD = config.password
# oracledb resolves TNS aliases through the wallet directory named by TNS_ADMIN.
os.environ["TNS_ADMIN"] = WALLET_PATH

# =========================
# Global Configurations
# =========================
INDEX_PATH = config.index_path
# Pickle file tracking which documents have already been ingested.
PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl")
# Matches markdown headings (# .. ######) or fully-bolded lines used as chapter breaks.
chapter_separator_regex = r"^(#{1,6} .+|\*\*.+\*\*)$"
GRAPH_NAME = config.graph_name
# In-memory ring buffer of recent log lines (entries beyond 500 are dropped).
LOG_BUFFER = deque(maxlen=500)
# Number of parallel self-consistency attempts per question.
MAX_ATTEMPTS = 3
# Upper bound on concurrent GenAI calls; the semaphore gates call_llm().
GENAI_MAX_CONCURRENT = 1000
GENAI_SEMAPHORE = threading.Semaphore(GENAI_MAX_CONCURRENT)
|
||
|
||
def call_llm(fn, *args, **kwargs):
    """Invoke *fn* under the global GenAI concurrency semaphore.

    All positional and keyword arguments are forwarded unchanged and the
    callee's return value is passed through.
    """
    GENAI_SEMAPHORE.acquire()
    try:
        return fn(*args, **kwargs)
    finally:
        GENAI_SEMAPHORE.release()
|
||
|
||
# =========================
|
||
# LLM Definitions
|
||
# =========================
|
||
|
||
# Deterministic chat model (temperature 0, top_p 1) for structured/JSON tasks.
llm = ChatOCIGenAI(
    model_id=config.llm_model,
    service_endpoint=config.service_endpoint,
    compartment_id=config.compartment_id,
    auth_profile=config.auth_profile,
    model_kwargs={"temperature": 0, "top_p": 1, "max_tokens": 4000},
)

# Chat model with provider-default sampling, used by the RAG chains.
llm_for_rag = ChatOCIGenAI(
    model_id=config.llm_model,
    service_endpoint=config.service_endpoint,
    compartment_id=config.compartment_id,
    auth_profile=config.auth_profile,
)

# Model backing the architecture-planning chain.
# NOTE(review): "lrm" looks like a typo for "llm"; the name is kept because
# build_architecture_chain() references it.
lrm_for_architect = ChatOCIGenAI(
    model_id=config.llm_model,
    service_endpoint=config.service_endpoint,
    compartment_id=config.compartment_id,
    auth_profile=config.auth_profile,
)

# Embedding model used for the FAISS vector store.
embeddings = OCIGenAIEmbeddings(
    model_id=config.embedding_model,
    service_endpoint=config.service_endpoint,
    compartment_id=config.compartment_id,
    auth_profile=config.auth_profile,
)

# Single process-wide Oracle Autonomous DB connection (mTLS via wallet).
oracle_conn = oracledb.connect(
    user=USERNAME,
    password=PASSWORD,
    dsn=DB_ALIAS,
    config_dir=WALLET_PATH,
    wallet_location=WALLET_PATH,
    wallet_password=PASSWORD
)
# Best-effort close at interpreter shutdown.
atexit.register(lambda: oracle_conn.close())
|
||
|
||
def filename_to_url(filename: str, suffix: str = ".pdf") -> str:
    """Decode a URL that was stored as a URL-safe base64 file name.

    Inverse of the ingestion step that encodes a document URL with
    ``base64.urlsafe_b64encode`` and appends *suffix*.

    Args:
        filename: Encoded file name, optionally ending in *suffix*.
        suffix: Extension to strip before decoding (default ``".pdf"``).

    Returns:
        The original URL as a UTF-8 string.

    Raises:
        binascii.Error: If the payload is not valid base64.
    """
    if filename.endswith(suffix):
        filename = filename[: -len(suffix)]
    # Re-pad to a multiple of 4: trailing '=' characters are frequently
    # stripped when base64 is embedded in file names, and urlsafe_b64decode
    # raises on unpadded input.
    filename += "=" * (-len(filename) % 4)
    decoded = base64.urlsafe_b64decode(filename.encode("ascii"))
    return decoded.decode("utf-8")
|
||
|
||
def default_logger(msg):
    """Fallback logger: write *msg* to stdout."""
    print(msg)
|
||
|
||
# =========================================
|
||
# ARCHITECTURE-SPECIFIC SOURCE RANKING
|
||
# =========================================
|
||
|
||
class BrowserLogCallback(BaseCallbackHandler):
    """LangChain callback that mirrors chain/LLM/retriever progress to a logger.

    *logger* is any callable accepting a single message string (e.g. a
    function that streams status lines to a browser console).
    """

    def __init__(self, logger):
        # Callable invoked with each status line.
        self.log = logger

    # ---------- CHAIN ----------
    def on_chain_start(self, serialized, inputs, **kwargs):
        self.log("🔵 Chain started")

    def on_chain_end(self, outputs, **kwargs):
        self.log("🟢 Chain finished")

    # ---------- LLM ----------
    def on_llm_start(self, serialized, prompts, **kwargs):
        self.log("🤖 LLM call started")
        # Only the first prompt's size is reported.
        self.log(f"📏 Prompt size: {len(prompts[0])} chars")

    def on_llm_end(self, response, **kwargs):
        self.log("✅ LLM response received")

    # ---------- RETRIEVER ----------
    def on_retriever_start(self, serialized, query, **kwargs):
        self.log(f"🔍 Searching vector store: {query}")

    def on_retriever_end(self, documents, **kwargs):
        self.log(f"📚 Retrieved {len(documents)} chunks")

    # ---------- ERRORS ----------
    def on_chain_error(self, error, **kwargs):
        self.log(f"❌ ERROR: {error}")
|
||
|
||
# URL substrings that mark high-value architecture reading material
# (+3 each in score_arch_url).
ARCH_GOOD_HINTS = [
    "overview",
    "concept",
    "architecture",
    "service",
    "use-case"
]

# URL substrings that mark low-value landing / maintenance pages
# (-5 each in score_arch_url).
ARCH_BAD_HINTS = [
    "home",
    "index",
    "portal",
    "release-notes",
    "troubleshoot",
    "known-issues"
]
|
||
|
||
|
||
def score_arch_url(url: str) -> int:
    """Heuristically rank a URL's usefulness as architecture evidence.

    +3 per "good" hint substring, -5 per "bad" hint substring, and +2
    when the URL is hosted on docs.oracle.com. Falsy input scores 0.
    """
    if not url:
        return 0

    lowered = url.lower()

    total = 3 * sum(1 for hint in ARCH_GOOD_HINTS if hint in lowered)
    total -= 5 * sum(1 for hint in ARCH_BAD_HINTS if hint in lowered)

    if "docs.oracle.com" in lowered:
        total += 2

    return total
|
||
|
||
|
||
def resolve_arch_source(doc):
    """Pick the best source URL for an architecture chunk.

    Same idea as resolve_chunk_source, but ranked with the
    architecture-specific heuristic; does NOT affect the rest of the
    pipeline. Candidates are URLs found in the chunk text plus the
    "reference" / "source" metadata fields.

    Returns:
        The highest-scoring candidate URL, or a generic fallback string
        when no candidate exists.
    """
    text = doc.page_content or ""
    md = doc.metadata or {}

    candidates = []
    candidates += URL_REGEX.findall(text)

    if md.get("reference"):
        candidates.append(md["reference"])

    if md.get("source"):
        candidates.append(md["source"])

    if not candidates:
        return "Oracle Cloud Infrastructure documentation"

    # dict.fromkeys() dedupes while keeping first-seen order, so ties in
    # score_arch_url resolve deterministically (list(set(...)) made the
    # winner depend on hash-ordering between runs).
    candidates = list(dict.fromkeys(candidates))
    candidates.sort(key=score_arch_url, reverse=True)

    return candidates[0]
|
||
|
||
def strip_accents(s: str) -> str:
    """Return *s* with combining accent marks removed.

    Decomposes to NFD and drops every combining mark (category 'Mn').
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
|
||
|
||
def normalize_lang(code: str) -> str:
    """Map a langdetect ISO-639 code to an English language name.

    Unknown codes fall back to "English".
    """
    names = {
        "pt": "Portuguese",
        "en": "English",
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "it": "Italian"
    }
    return names.get(code, "English")
|
||
|
||
# =========================
|
||
# SOURCE VALIDATION (POST LLM)
|
||
# =========================
|
||
|
||
# Placeholder written in place of evidence URLs that fail validation.
INVALID_SOURCE_TOKEN = "---------"
# Per-request timeout (seconds) for URL existence probes.
URL_TIMEOUT = 3

# Memoizes url_exists() results for the process lifetime (unbounded).
_url_cache = {}
|
||
|
||
def url_exists(url: str) -> bool:
    """Check whether *url* resolves to a real (non-404, non-stub) page.

    Results are memoized in the module-level ``_url_cache``. Anything
    falsy or not starting with "http" is rejected outright. Oracle's
    soft-404 template (HTTP 200 with a "page not found" body) counts as
    missing.

    Returns:
        True when the page appears to exist; False otherwise, including
        on any network or parse error.
    """
    if not url or not url.startswith("http"):
        return False

    if url in _url_cache:
        return _url_cache[url]

    try:
        r = requests.get(
            url,
            timeout=URL_TIMEOUT,
            allow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0"}
        )

        if r.status_code >= 400:
            _url_cache[url] = False
            return False

        html = (r.text or "").lower()

        # ====================================
        # Oracle soft-404 template detection
        # ====================================
        soft_404_patterns = [
            "<title>page not found",
            "that page is not available",
            "class=\"page-not-found\"",
            "redwood-light-404.css",
            "error-container"
        ]

        if any(p in html for p in soft_404_patterns):
            _url_cache[url] = False
            return False

        # ====================================
        # Minimal REAL content (layout stripped)
        # ====================================
        # Lazy import: BeautifulSoup is only needed on this path.
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html, "html.parser")

        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()

        visible_text = soup.get_text(" ", strip=True)

        # Stub detection by visible-text length is currently disabled;
        # restore the threshold below to re-enable it.
        # if len(visible_text) < 300:
        #     _url_cache[url] = False
        #     return False

        _url_cache[url] = True
        return True

    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; any request/parse failure means
        # "not reachable".
        _url_cache[url] = False
        return False
|
||
|
||
def validate_and_sanitize_sources(answer: dict) -> dict:
    """Replace unreachable evidence URLs with ``INVALID_SOURCE_TOKEN``.

    Each evidence entry's "source" (a string or a list of strings) is
    probed with url_exists(); failures are replaced by the placeholder
    token. The input dict is not mutated: a shallow copy of the top
    level plus a fresh evidence list is returned. Non-dict input yields
    a canned "NO" answer.

    Note: the leftover debug ``print`` of rejected sources was removed.
    """
    if not isinstance(answer, dict):
        return {"answer": "NO", "confidence": "LOW", "ambiguity_detected": True,
                "confidence_reason": "Invalid answer type", "justification": "", "evidence": []}

    # Shallow copy of the top level + fresh evidence list.
    out = dict(answer)
    evidences = out.get("evidence", [])

    if not isinstance(evidences, list):
        return out

    new_evidences = []
    for ev in evidences:
        if not isinstance(ev, dict):
            # Malformed entry: keep as-is rather than drop information.
            new_evidences.append(ev)
            continue

        ev2 = dict(ev)
        src = ev2.get("source")

        if isinstance(src, list):
            ev2["source"] = [
                s if (isinstance(s, str) and url_exists(s)) else INVALID_SOURCE_TOKEN
                for s in src
            ]
        else:
            ev2["source"] = (
                src if (isinstance(src, str) and url_exists(src)) else INVALID_SOURCE_TOKEN
            )

        new_evidences.append(ev2)

    out["evidence"] = new_evidences
    return out
|
||
|
||
# =========================
|
||
# LRM Definitions
|
||
# =========================
|
||
def build_architecture_evidence(docs, max_chunks=30):
    """Build the evidence payload for the architecture prompt.

    Chunks are ranked by the architecture URL heuristic and the top
    *max_chunks* become {"quote", "source"} dicts. Quotes are capped at
    3000 chars and inline "Reference: <url>" markers are stripped so the
    LLM cannot copy stale links from the text body.

    Args:
        docs: LangChain documents (page_content + metadata).
        max_chunks: Maximum number of evidence entries to emit.

    Returns:
        List of {"quote": str, "source": str} dicts, best-ranked first.
    """
    # Resolve each chunk's source exactly once: resolve_arch_source()
    # regex-scans the chunk text, so the original pattern of calling it
    # in the sort key AND per output entry did the work twice (and could
    # even disagree between the two calls).
    resolved = [(d, resolve_arch_source(d)) for d in docs]
    resolved.sort(key=lambda pair: score_arch_url(pair[1]), reverse=True)

    evidence = []
    for d, source in resolved[:max_chunks]:
        quote = d.page_content[:3000]
        quote = re.sub(r"Reference:\s*\S+", "", quote)
        evidence.append({
            "quote": quote,
            "source": source
        })

    return evidence
|
||
|
||
def enforce_architecture_sources(plan: dict, evidence: list[dict]) -> dict:
    """Force every decision in *plan* to cite a source present in *evidence*.

    Decisions whose evidence source is not one of the known evidence
    sources get the best-matching evidence item substituted (matched by
    token overlap between the service name and the evidence text). If
    nothing matches and the decision has no source at all, the first
    evidence source is used as a fallback. Mutates and returns *plan*.
    """
    if not evidence:
        return plan

    # Evidence entries that actually carry a source.
    ev_list = [e for e in evidence if e.get("source")]
    valid_sources = {
        str(e.get("source", ""))
        for e in ev_list
        if e.get("source")
    }

    def pick_best_source(service: str) -> dict | None:
        # Rank evidence by how many >=3-char alphanumeric tokens of the
        # service name appear in its quote+source text; None when no
        # token overlaps at all.
        if not service:
            return None

        s = service.lower()

        best = None
        best_score = -1

        service_terms = [t for t in re.findall(r"[a-z0-9]+", s) if len(t) >= 3]

        for e in ev_list:
            hay = (e.get("quote", "") + " " + e.get("source", "")).lower()
            score = sum(1 for t in service_terms if t in hay)

            if score > best_score:
                best_score = score
                best = e

        return best if best_score > 0 else None

    for d in plan.get("decisions", []):
        service = d.get("service", "")
        ev = d.get("evidence", {}) or {}

        # Already cites a known source — leave untouched.
        if ev.get("source") in valid_sources:
            continue

        best_ev = pick_best_source(service)
        if best_ev:
            d["evidence"] = {
                "quote": best_ev.get("quote", ev.get("quote", "")),
                "source": best_ev["source"],
            }
            continue

        # No match at all: only backfill when the decision has no source.
        # NOTE(review): ev_list[0] raises IndexError if *evidence* is
        # non-empty but no entry has a source — confirm callers always
        # pass sourced evidence.
        if not ev.get("source"):
            d["evidence"] = {
                "quote": ev.get("quote", ""),
                "source": ev_list[0]["source"],
            }

    return plan
|
||
|
||
def build_architecture_chain():
    """Build the LCEL chain that designs an OCI architecture for a question.

    Pipeline: parse the RFP requirement, fetch text + graph context ONCE,
    fill the architecture prompt, invoke the architect model, and return
    its raw string output. Progress is streamed via BrowserLogCallback.

    Returns:
        A Runnable invoked with the question string.
    """

    ARCH_PROMPT = PromptTemplate.from_template("""
You are a senior OCI Cloud Architect.

TASK:
Design an OCI architecture for:

{question}

You MUST design the solution using the provided documentation evidence.

**DOCUMENT EVIDENCE** (JSON):
{text_context}

**GRAPH FACTS**:
{graph_context}

Rules (MANDATORY):
- Use ONLY services supported by **DOCUMENT EVIDENCE**.
- The architecture may involve MULTIPLE OCI services; therefore decisions may require DIFFERENT sources (do not reuse a single URL for everything unless it truly applies).
- Set each service from this OCI Services:
Oracle Cloud Infrastructure Service (OCI):

Compute (IaaS)
• Compute Instances (VM)
• Bare Metal Instances
• Dedicated VM Hosts
• GPU Instances
• Confidential Computing
• Capacity Reservations
• Autoscaling (Instance Pools)
• Live Migration
• Oracle Cloud VMware Solution (OCVS)
• HPC (High Performance Computing)
• Arm-based Compute (Ampere)

Storage

Object Storage
• Object Storage
• Object Storage – Archive
• Pre-Authenticated Requests
• Replication

Block & File
• Block Volume
• Boot Volume
• Volume Groups
• File Storage
• File Storage Snapshots
• Data Transfer Service

Networking
• Virtual Cloud Network (VCN)
• Subnets
• Internet Gateway
• NAT Gateway
• Service Gateway
• Dynamic Routing Gateway (DRG)
• FastConnect
• Load Balancer (L7 / L4)
• Network Load Balancer
• DNS
• Traffic Management Steering Policies
• IP Address Management (IPAM)
• Network Firewall
• Web Application Firewall (WAF)
• Bastion
• Capture Traffic (VTAP)
• Private Endpoints

Security, Identity & Compliance
• Identity and Access Management (IAM)
• Compartments
• Policies
• OCI Vault
• OCI Key Management (KMS)
• OCI Certificates
• OCI Secrets
• OCI Bastion
• Cloud Guard
• Security Zones
• Vulnerability Scanning Service
• Data Safe
• Logging
• Audit
• OS Management / OS Management Hub
• Shielded Instances
• Zero Trust Packet Routing

Databases

Autonomous
• Autonomous Database (ATP)
• Autonomous Data Warehouse (ADW)
• Autonomous JSON Database

Databases Gerenciados
• Oracle Database Service
• Oracle Exadata Database Service
• Exadata Cloud@Customer
• Base Database Service
• MySQL Database Service
• MySQL HeatWave
• NoSQL Database Cloud Service
• TimesTen
• PostgreSQL (OCI managed)
• MongoDB API (OCI NoSQL compatibility)

Analytics & BI
• Oracle Analytics Cloud (OAC)
• OCI Data Catalog
• OCI Data Integration
• OCI Streaming Analytics
• OCI GoldenGate
• OCI Big Data Service (Hadoop/Spark)
• OCI Data Science
• OCI AI Anomaly Detection
• OCI AI Forecasting

AI & Machine Learning

Generative AI
• OCI Generative AI
• OCI Generative AI Agents
• OCI Generative AI RAG
• OCI Generative AI Embeddings
• OCI AI Gateway (OpenAI-compatible)

AI Services
• OCI Vision (OCR, image analysis)
• OCI Speech (STT / TTS)
• OCI Language (NLP)
• OCI Document Understanding
• OCI Anomaly Detection
• OCI Forecasting
• OCI Data Labeling

Containers & Cloud Native
• OCI Container Engine for Kubernetes (OKE)
• Container Registry (OCIR)
• Service Mesh
• API Gateway
• OCI Functions (FaaS)
• OCI Streaming (Kafka-compatible)
• OCI Queue
• OCI Events
• OCI Resource Manager (Terraform)

Integration & Messaging
• OCI Integration Cloud (OIC)
• OCI Service Connector Hub
• OCI Streaming
• OCI GoldenGate
• OCI API Gateway
• OCI Events Service
• OCI Queue
• Real Applications Clusters (RAC)

Developer Services
• OCI DevOps (CI/CD)
• OCI Code Repository
• OCI Build Pipelines
• OCI Artifact Registry
• OCI Logging Analytics
• OCI Monitoring
• OCI Notifications
• OCI Bastion
• OCI CLI
• OCI SDKs

Observability & Management
• OCI Monitoring
• OCI Alarms
• OCI Logging
• OCI Logging Analytics
• OCI Application Performance Monitoring (APM)
• OCI Operations Insights
• OCI Management Agent
• OCI Resource Discovery

Enterprise & Hybrid
• Oracle Cloud@Customer
• Exadata Cloud@Customer
• Compute Cloud@Customer
• Dedicated Region Cloud@Customer
• OCI Roving Edge Infrastructure
• OCI Alloy

Governance & FinOps
• OCI Budgets
• Cost Analysis
• Usage Reports
• Quotas
• Tagging
• Compartments
• Resource Search

Regions & Edge
• OCI Regions (Commercial, Government, EU Sovereign)
• OCI Edge Services
• OCI Roving Edge
• OCI Dedicated Region

STRICT SERVICE GROUNDING RULE (MANDATORY):
- For each decision, use evidence from the SAME service_group as the decision service.
- Do NOT justify one service using evidence from another service's documentation.

SOURCE RULES (STRICT):
- Copy ONLY URLs that appear EXACTLY in DOCUMENT EVIDENCE or GRAPH FACTS
- NEVER create or guess URLs
- If no URL is explicitly present, set source = null
- It is allowed to return null
- GIVE MANY SOURCES URL
- GIVE A COMPLETE PATH OF URL SOURCES TO UNDERSTAND THE CONCEPTS THEME
- GIVE one or more OVERVIEW SOURCE URL
- GIVE one or more SOLUTION AND ARCHITECTURE SOURCE URL

MANDATORY:
- Break into requirements
- Map each requirement to OCI services
- Justify each choice

LANGUAGE RULE (MANDATORY): **DO TRANSLATION AS THE LAST STEP**
- Write ALL textual values in {lang}
- Keep JSON keys in English
- Do NOT translate keys

Return ONLY JSON:

{{
"problem_summary": "...",
"architecture": {{
"components": [
{{
"id": "api",
"service": "OCI API Gateway",
"purpose": "...",
"source": ["__AUTO__"],
"connects_to": []
}}
]
}},
"decisions": [
{{
"service": "...",
"reason": "must cite evidence",
"evidence": {{
"quote": "...",
"source": ["__AUTO__"]
}}
}}
]
}}
""")

    callback = BrowserLogCallback(default_logger)

    def _prepare(q):
        # Fetch the expensive retrieval context a single time. The
        # previous RunnableMap called get_architecture_context() once per
        # prompt field, doubling vector-store and graph queries.
        req = parse_rfp_requirement(q)
        ctx = get_architecture_context(req)
        return {
            "question": q,
            "text_context": ctx["text_context"],
            "graph_context": ctx["graph_context"],
            "lang": normalize_lang(detect(q)),
        }

    chain_arch = (
        RunnableLambda(_prepare)
        | ARCH_PROMPT
        | lrm_for_architect
        | StrOutputParser()
    ).with_config(callbacks=[callback])

    return chain_arch
|
||
|
||
def score_url_quality(url: str) -> int:
    """Score a URL's citation quality for architecture plans.

    Best-match bonus (elif chain): /solutions/ pages +8, YouTube +7,
    conceptual docs +5, any docs.oracle.com page +3. A -10 penalty
    applies when the URL looks like a landing/maintenance page. Falsy
    input scores 0.
    """
    if not url:
        return 0

    lowered = url.lower()

    if "/solutions/" in lowered:
        quality = 8
    elif "youtube.com" in lowered or "youtu.be" in lowered:
        quality = 7
    elif any(marker in lowered for marker in (
        "architecture", "overview", "concept", "how-to", "use-case"
    )):
        quality = 5
    elif "docs.oracle.com" in lowered:
        quality = 3
    else:
        quality = 0

    low_value = ("home", "index", "portal", "release-notes", "faq", "troubleshoot")
    if any(marker in lowered for marker in low_value):
        quality -= 10

    return quality
|
||
|
||
|
||
def score_architecture_plan(plan: dict) -> int:
    """Score an architecture plan for self-consistency selection.

    +3 per component, +4 per decision, plus per-decision URL quality
    (score_url_quality) and up to +4 for quote length (1 point per 500
    chars). Falsy plans score -1.
    """
    if not plan:
        return -1

    components = plan.get("architecture", {}).get("components", [])
    decisions = plan.get("decisions", [])

    total = 3 * len(components) + 4 * len(decisions)

    for decision in decisions:
        ev = decision.get("evidence", {}) or {}

        sources = ev.get("source", [])
        if isinstance(sources, str):
            sources = [sources]
        total += sum(score_url_quality(s) for s in sources)

        quote = ev.get("quote", "") or ""
        total += min(len(quote) // 500, 4)

    return total
|
||
|
||
def call_architecture_planner(
    question: str,
    parallel_attempts: int = MAX_ATTEMPTS,
    log=default_logger
):
    """Run the architecture chain several times in parallel; keep the best plan.

    Self-consistency strategy: *parallel_attempts* independent LLM runs
    are parsed and scored with score_architecture_plan(); the highest
    scoring plan wins and is then passed through source validation.

    Args:
        question: RFP question / requirement text.
        parallel_attempts: Number of concurrent attempts.
        log: Logger callable (defaults to stdout).

    Returns:
        The validated architecture plan dict.
    """
    log("\n🏗️ ARCHITECTURE (PARALLEL SELF-CONSISTENCY)")

    def worker(i):
        # One independent generate + parse + score round.
        # (Stray print(0..3) debug statements were removed.)
        raw = call_llm(chain_architecture.invoke, question)
        plan = safe_parse_architecture_json(raw)

        try:
            score = score_architecture_plan(plan)
        except Exception:
            # Malformed plan: keep the attempt but rank it last.
            # (Narrowed from a bare except.)
            print("Error scoring", plan)
            score = 0

        return {
            "attempt": i,
            "plan": plan,
            "score": score
        }

    results = []

    with ThreadPoolExecutor(max_workers=parallel_attempts) as executor:
        futures = [
            executor.submit(worker, i)
            for i in range(1, parallel_attempts + 1)
        ]

        for f in as_completed(futures):
            results.append(f.result())

    results.sort(key=lambda r: r["score"], reverse=True)

    for r in results:
        log(f"⚡ Attempt {r['attempt']} score={r['score']}")

    best = results[0]["plan"]

    log(f"\n🏆 Selected architecture from attempt {results[0]['attempt']}")

    # best is already a parsed dict; safe_parse_architecture_json() is a
    # pass-through for dicts and is kept as a safety net for odd output.
    plan = safe_parse_architecture_json(best)
    plan = validate_architecture_sources(plan)

    return plan
|
||
|
||
def architecture_to_mermaid(plan: dict) -> str:
    """Render the plan's component graph as Mermaid flowchart source.

    Nodes are labelled "<service>\\n<purpose>" with the purpose
    word-wrapped at 28 chars; edges come from each component's
    "connects_to" list, whose items may be ids (str) or {"id": ...}
    dicts.

    Returns:
        Mermaid source text; a stub diagram when there are no components.
    """
    architecture = plan.get("architecture", {})
    comps = architecture.get("components", [])

    if not comps:
        return "flowchart TB\nEmpty[No components]"

    # Top-to-bottom reads better once the diagram grows past 6 nodes.
    direction = "TB" if len(comps) > 6 else "LR"

    lines = [f"flowchart {direction}"]

    # nodes
    for c in comps:
        cid = c["id"]
        # BUG FIX: wrapped purpose lines must be joined with the literal
        # "\n" escape (as the service separator already is) — a raw
        # newline inside a quoted Mermaid node label is invalid syntax
        # and broke diagrams whenever the purpose wrapped.
        purpose = "\\n".join(textwrap.wrap(c["purpose"], 28))
        label = f'{c["service"]}\\n{purpose}'
        lines.append(f'{cid}["{label}"]')

    # edges
    for c in comps:
        src = c["id"]

        for target in c.get("connects_to", []):
            if isinstance(target, dict):
                dst = target.get("id")
            elif isinstance(target, str):
                dst = target
            else:
                continue

            if dst:
                lines.append(f"{src} --> {dst}")

    return "\n".join(lines)
|
||
|
||
def get_architecture_context(req: dict):
    """Gather retrieval context for one parsed RFP requirement.

    Combines vector-store chunks with knowledge-graph facts, reranks the
    chunks using terms mined from the graph output, and packages the top
    results as structured evidence for the architecture prompt.
    """
    query_terms = extract_graph_keywords_from_requirement(req)

    chunks = search_active_chunks(query_terms)
    graph_context = query_knowledge_graph(query_terms, top_k=20, min_score=1)

    # Terms surfaced by the graph drive a second-pass rerank of the chunks.
    graph_terms = extract_terms_from_graph_text(graph_context)
    top_chunks = rerank_documents_with_graph_terms(
        chunks,
        query_terms,
        graph_terms,
        top_k=8
    )

    return {
        "text_context": build_architecture_evidence(top_chunks),
        "graph_context": graph_context,
        "requirement_type": req["requirement_type"],
        "subject": req["subject"],
        "expected_value": req.get("expected_value", ""),
    }
|
||
|
||
def extract_first_balanced_json(text: str) -> str | None:
|
||
"""
|
||
Extrai APENAS o primeiro JSON balanceado.
|
||
Ignora QUALQUER coisa depois.
|
||
"""
|
||
|
||
start = text.find("{")
|
||
if start == -1:
|
||
return None
|
||
|
||
depth = 0
|
||
|
||
for i, c in enumerate(text[start:], start):
|
||
if c == "{":
|
||
depth += 1
|
||
elif c == "}":
|
||
depth -= 1
|
||
|
||
if depth == 0:
|
||
return text[start:i+1]
|
||
|
||
return None
|
||
|
||
def sanitize_json_string(text: str) -> str:
    """Double any backslash that does not start a valid JSON escape."""
    invalid_escape = r'\\(?!["\\/bfnrtu])'
    return re.sub(invalid_escape, r'\\\\', text)
|
||
|
||
|
||
def recover_json_object(text: str) -> str | None:
|
||
"""
|
||
Extrai o primeiro objeto JSON possível.
|
||
- ignora lixo antes/depois
|
||
- tolera truncamento
|
||
- fecha chaves automaticamente
|
||
"""
|
||
|
||
if not text:
|
||
return None
|
||
|
||
# remove markdown fences
|
||
text = text.replace("```json", "").replace("```", "")
|
||
|
||
start = text.find("{")
|
||
if start == -1:
|
||
return None
|
||
|
||
text = text[start:]
|
||
|
||
depth = 0
|
||
end_pos = None
|
||
|
||
for i, c in enumerate(text):
|
||
if c == "{":
|
||
depth += 1
|
||
elif c == "}":
|
||
depth -= 1
|
||
|
||
if depth == 0:
|
||
end_pos = i + 1
|
||
break
|
||
|
||
# ✅ caso normal → JSON completo
|
||
if end_pos:
|
||
return text[:end_pos]
|
||
|
||
# 🔥 caso TRUNCADO → fecha automaticamente
|
||
opens = text.count("{")
|
||
closes = text.count("}")
|
||
|
||
missing = opens - closes
|
||
|
||
if missing > 0:
|
||
text = text + ("}" * missing)
|
||
|
||
text = sanitize_json_string(text)
|
||
text = extract_first_balanced_json(text)
|
||
|
||
return text
|
||
|
||
def safe_parse_architecture_json(raw):
    """Robust JSON parser for LLM output.

    Dicts pass through unchanged and falsy input yields {}. Otherwise
    code fences are stripped, the outermost {...} block is isolated,
    control characters are scrubbed, and recover_json_object() is the
    fallback when the first json.loads fails.

    Raises:
        ValueError: when no JSON object can be located at all.
    """
    if isinstance(raw, dict):
        return raw

    if not raw:
        return {}

    raw = raw.replace("```json", "").replace("```", "").strip()

    # Keep only the outermost JSON block.
    match = re.search(r"\{.*\}", raw, re.DOTALL)
    if match is None:
        raise ValueError(f"Invalid architecture JSON:\n{raw}")

    json_text = match.group(0)

    # Remove invisible control characters.
    json_text = re.sub(r"[\x00-\x1F\x7F]", " ", json_text)

    # Normalize newlines.
    json_text = json_text.replace("\r", " ").replace("\n", " ")

    try:
        return json.loads(json_text)
    except Exception as parse_err:
        print(parse_err)
        print("⚠️ JSON RAW (debug):")
        print(json_text)
        try:
            return json.loads(recover_json_object(json_text))
        except Exception as recover_err:
            print(recover_err)
            raise recover_err
|
||
|
||
# =========================
|
||
# Oracle Graph Client
|
||
# =========================
|
||
# Ordinal ranking of LLM verdicts; higher is better.
ANSWER_RANK = {
    "YES": 3,
    "PARTIAL": 2,
    "NO": 1
}

# Ordinal ranking of self-reported confidence levels.
CONFIDENCE_RANK = {
    "HIGH": 3,
    "MEDIUM": 2,
    "LOW": 1
}

def score_answer(parsed: dict) -> int:
    """Score a parsed LLM answer for self-consistency selection.

    Base score combines the YES/PARTIAL/NO verdict (x10) with the
    confidence level; evidence quality then adds +3 per unique source,
    up to +5 for total quote volume (1 point per 500 chars across all
    sourced evidence), and heuristic per-source quality weights.

    Args:
        parsed: Answer dict with "answer", "confidence" and "evidence"
            (list of {"source", "quote"}) keys.

    Returns:
        Integer score; higher is better.
    """
    ans = parsed.get("answer", "NO")
    conf = parsed.get("confidence", "LOW")
    evidence = parsed.get("evidence", [])

    # Logical base score (verdict dominates, confidence breaks ties).
    base = ANSWER_RANK.get(ans, 0) * 10 + CONFIDENCE_RANK.get(conf, 0)

    unique_sources = set()
    quote_size = 0
    source_quality = 0

    for e in evidence:
        src = (e.get("source") or "").lower()
        quote = e.get("quote", "")

        # Evidence without a source contributes nothing.
        if not src:
            continue

        unique_sources.add(src)
        # BUG FIX: quote_size was reset to 0 on every iteration, so only
        # the LAST evidence item's quote length counted. Accumulate.
        if quote:
            quote_size += len(quote)

        # Quality weights by link type (best match wins via elif chain).
        if "/solutions/" in src:
            source_quality += 6

        elif "youtube.com" in src:
            source_quality += 5

        elif any(x in src for x in ["architecture", "overview", "concepts", "how-to"]):
            source_quality += 4

        elif "docs.oracle.com" in src:
            source_quality += 2

        elif any(x in src for x in ["home", "index", "portal"]):
            source_quality -= 5

    evidence_score = (
        len(unique_sources) * 3 +
        min(quote_size // 500, 5) +
        source_quality
    )

    return base + evidence_score
|
||
|
||
def ensure_oracle_text_index(
    conn,
    table_name: str,
    column_name: str,
    index_name: str
):
    """Create, repair, or sync an Oracle Text (CTXSYS.CONTEXT) index.

    Three cases:
      * index missing   -> create it and return (sync deferred),
      * index not VALID -> drop and recreate it, then return,
      * index VALID     -> run CTX_DDL.SYNC_INDEX to pick up new rows.

    Drop and sync failures are logged and swallowed so the ingestion
    pipeline keeps running.
    """
    cursor = conn.cursor()

    # Look up the index in the current schema's catalog.
    cursor.execute("""
        SELECT status
        FROM user_indexes
        WHERE index_name = :idx
    """, {"idx": index_name.upper()})

    row = cursor.fetchone()
    index_exists = row is not None
    index_status = row[0] if row else None

    if not index_exists:
        print(f"🛠️ Creating Oracle Text index {index_name}")

        # Identifiers cannot be bound, hence the f-string DDL; the
        # table/column/index names come from internal config, not users.
        cursor.execute(f"""
            CREATE INDEX {index_name}
            ON {table_name} ({column_name})
            INDEXTYPE IS CTXSYS.CONTEXT
        """)

        conn.commit()
        cursor.close()
        print(f"✅ Index {index_name} created (sync deferred)")
        return

    if index_status != "VALID":
        print(f"⚠️ Index {index_name} is {index_status}. Recreating...")

        try:
            cursor.execute(f"DROP INDEX {index_name}")
            conn.commit()
        except Exception as e:
            # Cannot drop a broken index: give up without killing the caller.
            print(f"❌ Failed to drop index {index_name}: {e}")
            cursor.close()
            return

        cursor.execute(f"""
            CREATE INDEX {index_name}
            ON {table_name} ({column_name})
            INDEXTYPE IS CTXSYS.CONTEXT
        """)
        conn.commit()
        cursor.close()
        print(f"♻️ Index {index_name} recreated (sync deferred)")
        return

    # Healthy index: incrementally sync pending DML ('2M' = sort memory).
    print(f"🔄 Syncing Oracle Text index: {index_name}")
    try:
        cursor.execute(f"""
            BEGIN
                CTX_DDL.SYNC_INDEX('{index_name}', '2M');
            END;
        """)
        conn.commit()
        print(f"✅ Index {index_name} synced")
    except Exception as e:
        print(f"⚠️ Sync failed for {index_name}: {e}")
        print("⚠️ Continuing without breaking pipeline")

    cursor.close()
|
||
|
||
def _col_exists(conn, table: str, col: str) -> bool:
|
||
cur = conn.cursor()
|
||
cur.execute("""
|
||
SELECT 1
|
||
FROM user_tab_cols
|
||
WHERE table_name = :t
|
||
AND column_name = :c
|
||
""", {"t": table.upper(), "c": col.upper()})
|
||
ok = cur.fetchone() is not None
|
||
cur.close()
|
||
return ok
|
||
|
||
|
||
def create_tables_if_not_exist(conn):
    """Create the knowledge-graph schema (nodes, edges, evidence) if absent.

    Each DDL runs inside EXECUTE IMMEDIATE and ignores ORA-00955 ("name
    is already used"), making the call idempotent. Afterwards a light
    migration adds CHUNK_HASH to a pre-existing edges table and an
    Oracle Text index is ensured on node names.
    """
    cursor = conn.cursor()

    try:
        # ---------------------------
        # KG_NODES
        # ---------------------------
        cursor.execute(f"""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE TABLE KG_NODES_{GRAPH_NAME} (
                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                        NODE_TYPE VARCHAR2(100),
                        NAME VARCHAR2(1000),
                        DESCRIPTION CLOB,
                        PROPERTIES CLOB,
                        CREATED_AT TIMESTAMP DEFAULT SYSTIMESTAMP
                    )
                ';
            EXCEPTION WHEN OTHERS THEN
                IF SQLCODE != -955 THEN RAISE; END IF;
            END;
        """)

        # ---------------------------
        # KG_EDGES (structure + bridge back to the source chunk)
        # ---------------------------
        cursor.execute(f"""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE TABLE KG_EDGES_{GRAPH_NAME} (
                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                        SOURCE_ID NUMBER,
                        TARGET_ID NUMBER,
                        EDGE_TYPE VARCHAR2(100),

                        -- ✅ governança / revogação
                        CHUNK_HASH VARCHAR2(64),

                        -- ✅ link principal (melhor url do chunk)
                        SOURCE_URL VARCHAR2(2000),

                        CONFIDENCE_WEIGHT NUMBER DEFAULT 1,
                        CREATED_AT TIMESTAMP DEFAULT SYSTIMESTAMP
                    )
                ';
            EXCEPTION WHEN OTHERS THEN
                IF SQLCODE != -955 THEN RAISE; END IF;
            END;
        """)

        # De-dupe edges per (source, target, type, originating chunk).
        cursor.execute(f"""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE UNIQUE INDEX KG_EDGE_UNQ_{GRAPH_NAME}
                    ON KG_EDGES_{GRAPH_NAME}
                    (SOURCE_ID, TARGET_ID, EDGE_TYPE, CHUNK_HASH)
                ';
            EXCEPTION WHEN OTHERS THEN
                IF SQLCODE != -955 THEN RAISE; END IF;
            END;
        """)

        # ---------------------------
        # KG_EVIDENCE (supporting quotes per edge)
        # ---------------------------
        cursor.execute(f"""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE TABLE KG_EVIDENCE_{GRAPH_NAME} (
                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                        EDGE_ID NUMBER,
                        CHUNK_HASH VARCHAR2(64),
                        SOURCE_URL VARCHAR2(2000),
                        QUOTE CLOB,
                        CREATED_AT TIMESTAMP DEFAULT SYSTIMESTAMP
                    )
                ';
            EXCEPTION WHEN OTHERS THEN
                IF SQLCODE != -955 THEN RAISE; END IF;
            END;
        """)

        conn.commit()

    finally:
        cursor.close()

    # ---------------------------
    # Light migration (when an older KG_EDGES table predates CHUNK_HASH)
    # ---------------------------
    edges_table = f"KG_EDGES_{GRAPH_NAME}"

    if not _col_exists(conn, edges_table, "CHUNK_HASH"):
        cur = conn.cursor()
        cur.execute(f"ALTER TABLE {edges_table} ADD (CHUNK_HASH VARCHAR2(64))")
        conn.commit()
        cur.close()

    # NOTE(review): placed at function level so the text index is ensured
    # on every call, not only when the migration ran — confirm against the
    # original indentation.
    ensure_oracle_text_index(conn, f"KG_NODES_{GRAPH_NAME}", "NAME", f"KG_NODES_{GRAPH_NAME}_NAME")

    print("✅ Graph schema (probatório) ready.")
|
||
|
||
#create_tables_if_not_exist(oracle_conn)
|
||
|
||
# IF GRAPH INDEX PROBLEM, Reindex
|
||
# ensure_oracle_text_index(
|
||
# oracle_conn,
|
||
# "ENTITIES_" + GRAPH_NAME,
|
||
# "NAME",
|
||
# "IDX_ENT_" + GRAPH_NAME + "_NAME"
|
||
# )
|
||
#
|
||
# ensure_oracle_text_index(
|
||
# oracle_conn,
|
||
# "RELATIONS_" + GRAPH_NAME,
|
||
# "RELATION_TYPE",
|
||
# "IDX_REL_" + GRAPH_NAME + "_RELTYPE"
|
||
# )
|
||
|
||
def create_knowledge_graph(chunks):
    """Extract capability triples from document chunks and persist them as a graph.

    For each chunk: ask the LLM for ``SERVICE -[RELATION]-> TARGET`` triples,
    upsert both endpoints into KG_NODES_{GRAPH_NAME}, insert a per-chunk
    deduplicated edge into KG_EDGES_{GRAPH_NAME} (tagged with chunk_hash and
    the resolved source URL), and store a supporting quote in
    KG_EVIDENCE_{GRAPH_NAME}. Commits every COMMIT_BATCH edge inserts and
    once at the end.

    Relies on module globals: oracle_conn, GRAPH_NAME, llm_for_rag, call_llm,
    resolve_chunk_source.
    """

    cursor = oracle_conn.cursor()
    inserted_counter = 0  # edges inserted so far; drives the batch commits
    COMMIT_BATCH = 500

    # =====================================================
    # 1️⃣ CREATE PROPERTY GRAPH (if it does not exist yet)
    # =====================================================
    # PL/SQL swallows ORA-00955 / ORA-55358 ("already exists") so the call
    # is idempotent across runs.
    try:
        cursor.execute(f"""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE PROPERTY GRAPH {GRAPH_NAME}
                    VERTEX TABLES (
                        KG_NODES_{GRAPH_NAME}
                        KEY (ID)
                        LABEL NODE_TYPE
                        PROPERTIES (NAME, DESCRIPTION, PROPERTIES)
                    )
                    EDGE TABLES (
                        KG_EDGES_{GRAPH_NAME}
                        KEY (ID)
                        SOURCE KEY (SOURCE_ID) REFERENCES KG_NODES_{GRAPH_NAME}(ID)
                        DESTINATION KEY (TARGET_ID) REFERENCES KG_NODES_{GRAPH_NAME}(ID)
                        LABEL EDGE_TYPE
                        PROPERTIES (CHUNK_HASH, SOURCE_URL, CONFIDENCE_WEIGHT)
                    )
                ';
            EXCEPTION
                WHEN OTHERS THEN
                    IF SQLCODE NOT IN (-55358, -955) THEN
                        RAISE;
                    END IF;
            END;
        """)
        print(f"🧠 Graph '{GRAPH_NAME}' ready.")
    except Exception as e:
        print(f"[GRAPH ERROR] {e}")

    # =====================================================
    # 2️⃣ Helper: MERGE NODE (optimized)
    # =====================================================
    def build_default_node_properties():
        """Default JSON payload stored in KG_NODES.PROPERTIES for new nodes."""
        return {
            "metadata": {
                "created_by": "RFP_AI_V2",
                "version": "2.0",
                # NOTE(review): datetime.utcnow() is naive and deprecated in
                # Python 3.12+; consider datetime.now(timezone.utc).
                "created_at": datetime.utcnow().isoformat()
            },
            "analysis": {
                "confidence_score": None,
                "source": "DOCUMENT_RAG",
                "extraction_method": "LLM_TRIPLE_EXTRACTION"
            },
            "governance": {
                "validated": False,
                "review_required": False
            }
        }

    def ensure_node_properties_structure(properties):
        """Deep-fill *properties* with any keys missing from the default
        structure; returns the full defaults when *properties* is falsy."""
        base = build_default_node_properties()

        if not properties:
            return base

        def merge(d1, d2):
            # Copy defaults from d1 into d2 only where d2 lacks the key;
            # recurses into nested dicts. Mutates and returns d2.
            for k, v in d1.items():
                if k not in d2:
                    d2[k] = v
                elif isinstance(v, dict):
                    d2[k] = merge(v, d2.get(k, {}))
            return d2

        return merge(base, properties)

    def merge_node(node_type, name, description=None, properties=None):
        """Return the ID of the (NODE_TYPE, NAME) node, inserting it when absent.

        The name is trimmed and capped at 500 chars. If a concurrent writer
        inserts the same node first (IntegrityError under a unique
        constraint), the existing row is re-read instead of failing.
        """

        name = (name or "").strip()[:500]

        # 1️⃣ Try existing
        cursor.execute(f"""
            SELECT ID
            FROM KG_NODES_{GRAPH_NAME}
            WHERE NAME = :name_val
              AND NODE_TYPE = :node_type_val
        """, {
            "name_val": name,
            "node_type_val": node_type
        })

        row = cursor.fetchone()
        if row:
            return row[0]

        # 2️⃣ Insert safely
        node_id_var = cursor.var(oracledb.NUMBER)

        try:
            cursor.execute(f"""
                INSERT INTO KG_NODES_{GRAPH_NAME}
                (NODE_TYPE, NAME, DESCRIPTION, PROPERTIES)
                VALUES (:node_type_val, :name_val, :desc_val, :props_val)
                RETURNING ID INTO :node_id
            """, {
                "node_type_val": node_type,
                "name_val": name,
                "desc_val": description,
                "props_val": json.dumps(
                    ensure_node_properties_structure(properties)
                ),
                "node_id": node_id_var
            })

            return int(node_id_var.getvalue()[0])

        except oracledb.IntegrityError:
            # if unique constraint exists — another writer won the race,
            # so fetch the row it created.
            cursor.execute(f"""
                SELECT ID
                FROM KG_NODES_{GRAPH_NAME}
                WHERE NAME = :name_val
                  AND NODE_TYPE = :node_type_val
            """, {
                "name_val": name,
                "node_type_val": node_type
            })
            row = cursor.fetchone()
            if row:
                return row[0]
            raise

    def extract_sentence_with_term(text, term):
        """Return the first sentence of *text* containing *term*
        (case-insensitive); fall back to the first 1000 chars."""
        sentences = re.split(r'(?<=[.!?]) +', text)
        for s in sentences:
            if term.lower() in s.lower():
                return s.strip()
        return text[:1000]

    # =====================================================
    # 3️⃣ PROCESS CHUNKS
    # =====================================================
    for doc in chunks:

        text = doc.page_content or ""
        chunk_hash_value = doc.metadata.get("chunk_hash")
        source_url = resolve_chunk_source(doc)

        # Skip empty chunks and chunks without a governance hash.
        if not text.strip() or not chunk_hash_value:
            continue

        prompt = f"""
Extract explicit technical capabilities from the text below.

Text:
{text}

Return triples ONLY in format:

SERVICE -[SUPPORTS_CAPABILITY]-> CAPABILITY
SERVICE -[DOES_NOT_SUPPORT]-> CAPABILITY
SERVICE -[HAS_LIMITATION]-> LIMITATION
SERVICE -[HAS_SLA]-> SLA_VALUE

Rules:
- Use exact service names if present
- Use UPPERCASE relation names
- No inference
- If none found return NONE
"""

        try:
            response = call_llm(llm_for_rag.invoke, prompt)
            result = response.content.strip()
        except Exception as e:
            print(f"[LLM ERROR] {e}")
            continue

        if result.upper() == "NONE":
            continue

        triples = result.splitlines()

        for triple in triples:

            # Parse "LEFT -[RELATION]-> RIGHT"; malformed lines are skipped.
            parts = triple.split("-[")
            if len(parts) != 2:
                continue

            right = parts[1].split("]->")
            if len(right) != 2:
                continue

            entity1 = parts[0].strip()
            raw_relation = right[0].strip().upper()
            entity2 = right[1].strip()

            MAX_NODE_NAME = 500  # matches the cap applied inside merge_node

            entity1 = entity1[:MAX_NODE_NAME]
            entity2 = entity2[:MAX_NODE_NAME]

            # Normalize the relation to a safe identifier (non-word chars -> _).
            relation = re.sub(r'\W+', '_', raw_relation)

            source_id = merge_node(
                node_type="SERVICE",
                name=entity1,
                description=None,
                properties={
                    "chunk_hash": chunk_hash_value,
                    "source_url": source_url
                }
            )

            # Target node type follows the relation semantics.
            if relation == "DOES_NOT_SUPPORT":
                target_type = "UNSUPPORTED_CAPABILITY"
            elif relation == "HAS_LIMITATION":
                target_type = "LIMITATION"
            elif relation == "HAS_SLA":
                target_type = "SLA"
            else:
                target_type = "CAPABILITY"

            description_text = extract_sentence_with_term(text, entity2)

            target_id = merge_node(
                node_type=target_type,
                name=entity2,
                description=description_text
            )

            # 🔥 Avoid duplicating the same edge for the same chunk.
            cursor.execute(f"""
                SELECT ID
                FROM KG_EDGES_{GRAPH_NAME}
                WHERE SOURCE_ID = :s
                  AND TARGET_ID = :t
                  AND EDGE_TYPE = :r
                  AND CHUNK_HASH = :h
            """, {
                "s": source_id,
                "t": target_id,
                "r": relation,
                "h": chunk_hash_value
            })

            if cursor.fetchone():
                continue

            # =====================================================
            # INSERT EDGE + RETURNING ID
            # =====================================================
            edge_id_var = cursor.var(oracledb.NUMBER)

            cursor.execute(f"""
                INSERT INTO KG_EDGES_{GRAPH_NAME}
                (SOURCE_ID, TARGET_ID, EDGE_TYPE, CHUNK_HASH, SOURCE_URL, CONFIDENCE_WEIGHT)
                VALUES (:src, :tgt, :rel, :h, :url, :w)
                RETURNING ID INTO :edge_id
            """, {
                "src": source_id,
                "tgt": target_id,
                "rel": relation,
                "h": chunk_hash_value,
                "url": source_url,
                "w": 1,
                "edge_id": edge_id_var
            })

            edge_id = int(edge_id_var.getvalue()[0])

            # =====================================================
            # INSERT EVIDENCE (supporting quote, capped at 1500 chars)
            # =====================================================
            quote = text[:1500]

            cursor.execute(f"""
                INSERT INTO KG_EVIDENCE_{GRAPH_NAME}
                (EDGE_ID, CHUNK_HASH, SOURCE_URL, QUOTE)
                VALUES (:eid, :h, :url, :q)
            """, {
                "eid": edge_id,
                "h": chunk_hash_value,
                "url": source_url,
                "q": quote
            })

            inserted_counter += 1

            print(f"✅ {entity1} -[{relation}]-> {entity2}")

            # =====================================================
            # COMMIT EVERY 500 INSERTS
            # =====================================================
            if inserted_counter % COMMIT_BATCH == 0:
                oracle_conn.commit()
                print(f"💾 Batch commit ({inserted_counter} records)")

    # Final commit
    oracle_conn.commit()
    cursor.close()

    print(f"💾 Knowledge graph updated. Total inserted: {inserted_counter}")
|
||
|
||
def parse_rfp_requirement(question: str) -> dict:
    """Normalize an RFP question into a structured requirement dict via the LLM.

    The prompt forces the model to classify the requirement against a fixed
    OCI service catalog and return JSON inside <json> tags. The JSON block is
    extracted and parsed; on any failure a degraded fallback dict is returned
    (requirement_type UNKNOWN, the raw question as subject, and the first 5
    words of the question as keywords).

    Relies on module globals: llm_for_rag, call_llm.
    """
    prompt = f"""
You are an RFP requirement NORMALIZER for Oracle Cloud Infrastructure (OCI).

Your job is NOT to summarize the question.
Your job is to STRUCTURE the requirement so it can be searched in:
- Technical documentation
- Knowledge Graph
- Vector databases

────────────────────────────────
STEP 1 — Understand the requirement
────────────────────────────────
From the question, identify:
1. The PRIMARY OCI SERVICE CATEGORY involved
2. The MAIN TECHNICAL SUBJECT (short and precise)
3. The EXPECTED TECHNICAL CAPABILITY or CONDITION (if any)

IMPORTANT:
- Ignore marketing language
- Ignore phrases like "possui", "permite", "oferece"
- Focus ONLY on concrete technical meaning

────────────────────────────────
STEP 2 — Mandatory service classification
────────────────────────────────
You MUST choose ONE primary technology from the list below
and INCLUDE IT EXPLICITLY in the keywords list.

Choose the MOST SPECIFIC applicable item.

Serviços da Oracle Cloud Infrastructure (OCI):

Compute (IaaS)
• Compute Instances (VM)
• Bare Metal Instances
• Dedicated VM Hosts
• GPU Instances
• Confidential Computing
• Capacity Reservations
• Autoscaling (Instance Pools)
• Live Migration
• Oracle Cloud VMware Solution (OCVS)
• HPC (High Performance Computing)
• Arm-based Compute (Ampere)

Storage

Object Storage
• Object Storage
• Object Storage – Archive
• Pre-Authenticated Requests
• Replication

Block & File
• Block Volume
• Boot Volume
• Volume Groups
• File Storage
• File Storage Snapshots
• Data Transfer Service

Networking
• Virtual Cloud Network (VCN)
• Subnets
• Internet Gateway
• NAT Gateway
• Service Gateway
• Dynamic Routing Gateway (DRG)
• FastConnect
• Load Balancer (L7 / L4)
• Network Load Balancer
• DNS
• Traffic Management Steering Policies
• IP Address Management (IPAM)
• Network Firewall
• Web Application Firewall (WAF)
• Bastion
• Capture Traffic (VTAP)
• Private Endpoints

Security, Identity & Compliance
• Identity and Access Management (IAM)
• Compartments
• Policies
• OCI Vault
• OCI Key Management (KMS)
• OCI Certificates
• OCI Secrets
• OCI Bastion
• Cloud Guard
• Security Zones
• Vulnerability Scanning Service
• Data Safe
• Logging
• Audit
• OS Management / OS Management Hub
• Shielded Instances
• Zero Trust Packet Routing

Databases

Autonomous
• Autonomous Database (ATP)
• Autonomous Data Warehouse (ADW)
• Autonomous JSON Database

Databases Gerenciados
• Oracle Database Service
• Oracle Exadata Database Service
• Exadata Cloud@Customer
• Base Database Service
• MySQL Database Service
• MySQL HeatWave
• NoSQL Database Cloud Service
• TimesTen
• PostgreSQL (OCI managed)
• MongoDB API (OCI NoSQL compatibility)

Analytics & BI
• Oracle Analytics Cloud (OAC)
• OCI Data Catalog
• OCI Data Integration
• OCI Streaming Analytics
• OCI GoldenGate
• OCI Big Data Service (Hadoop/Spark)
• OCI Data Science
• OCI AI Anomaly Detection
• OCI AI Forecasting

AI & Machine Learning

Generative AI
• OCI Generative AI
• OCI Generative AI Agents
• OCI Generative AI RAG
• OCI Generative AI Embeddings
• OCI AI Gateway (OpenAI-compatible)

AI Services
• OCI Vision (OCR, image analysis)
• OCI Speech (STT / TTS)
• OCI Language (NLP)
• OCI Document Understanding
• OCI Anomaly Detection
• OCI Forecasting
• OCI Data Labeling

Containers & Cloud Native
• OCI Container Engine for Kubernetes (OKE)
• Container Registry (OCIR)
• Service Mesh
• API Gateway
• OCI Functions (FaaS)
• OCI Streaming (Kafka-compatible)
• OCI Queue
• OCI Events
• OCI Resource Manager (Terraform)

Integration & Messaging
• OCI Integration Cloud (OIC)
• OCI Service Connector Hub
• OCI Streaming
• OCI GoldenGate
• OCI API Gateway
• OCI Events Service
• OCI Queue
• Real Applications Clusters (RAC)

Developer Services
• OCI DevOps (CI/CD)
• OCI Code Repository
• OCI Build Pipelines
• OCI Artifact Registry
• OCI Logging Analytics
• OCI Monitoring
• OCI Notifications
• OCI Bastion
• OCI CLI
• OCI SDKs

Observability & Management
• OCI Monitoring
• OCI Alarms
• OCI Logging
• OCI Logging Analytics
• OCI Application Performance Monitoring (APM)
• OCI Operations Insights
• OCI Management Agent
• OCI Resource Discovery

Enterprise & Hybrid
• Oracle Cloud@Customer
• Exadata Cloud@Customer
• Compute Cloud@Customer
• Dedicated Region Cloud@Customer
• OCI Roving Edge Infrastructure
• OCI Alloy

Governance & FinOps
• OCI Budgets
• Cost Analysis
• Usage Reports
• Quotas
• Tagging
• Compartments
• Resource Search

Regions & Edge
• OCI Regions (Commercial, Government, EU Sovereign)
• OCI Edge Services
• OCI Roving Edge
• OCI Dedicated Region

────────────────────────────────
STEP 3 — Keywords rules (CRITICAL)
────────────────────────────────
The "keywords" field MUST:
- ALWAYS include at least ONE OCI service keyword (e.g. "compute", "object storage", "oke")
- Include technical capability terms (e.g. resize, autoscaling, encryption)
- NEVER include generic verbs (permitir, possuir, oferecer)
- NEVER include full sentences

────────────────────────────────
STEP 4 — Output rules
────────────────────────────────
Return ONLY valid JSON between <json> tags.
Do NOT explain your reasoning.

Question:
{question}

<json>
{{
"requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL",
"subject": "<short technical subject, e.g. 'Compute Instances'>",
"expected_value": "<technical capability or condition, or empty string>",
"decision_type": "YES_NO | YES_NO_PARTIAL",
"keywords": ["mandatory_oci_service", "technical_capability", "additional_term"]
}}
</json>
"""

    # resp = llm_for_rag.invoke(prompt)
    resp = call_llm(llm_for_rag.invoke, prompt)
    raw = resp.content.strip()

    try:
        # Strip markdown code fences (```json ... ``` or ``` ... ```).
        raw = re.sub(r"```json|```", "", raw).strip()

        # Pull the first {...} object between <json> tags.
        match = re.search(r"<json>\s*(\{.*?\})\s*</json>", raw, re.DOTALL)
        if not match:
            raise ValueError("No JSON block found")
        json_text = match.group(1)

        return json.loads(json_text)

    except Exception as e:
        # Fallback keeps the pipeline alive with a best-effort requirement.
        print("⚠️ RFP PARSER FAILED")
        print("RAW RESPONSE:")
        print(raw)

        return {
            "requirement_type": "UNKNOWN",
            "subject": question,
            "expected_value": "",
            "decision_type": "YES_NO_PARTIAL",
            "keywords": re.findall(r"\b\w+\b", question.lower())[:5]
        }
|
||
|
||
def extract_graph_keywords_from_requirement(req: dict) -> str:
    """Flatten a parsed requirement into a sorted, comma-separated keyword string.

    Combines the explicit ``keywords`` list with the lowercased ``subject``
    and ``expected_value`` fields (when present), de-duplicated via a set.
    """
    terms = set(req.get("keywords", []))

    subject = req.get("subject")
    if subject:
        terms.add(subject.lower())

    expected = req.get("expected_value")
    if expected:
        terms.add(str(expected).lower())

    return ", ".join(sorted(terms))
|
||
|
||
# Words dropped when building Oracle Text queries (see build_oracle_text_query):
# English boolean words plus common Portuguese articles/prepositions.
STOPWORDS = {
    "and", "or", "not",
    "de", "da", "do", "das", "dos", "a", "o", "as", "os", "e", "em", "no", "na", "nos", "nas", "para", "por", "com"
}
|
||
|
||
def build_oracle_text_query(text: str) -> Optional[str]:
    """Turn free text into a sanitized Oracle Text CONTAINS query string.

    The input is lowercased and de-accented, split into phrase-ish runs,
    stopwords and words shorter than 4 characters are dropped, multi-word
    phrases are double-quoted (hyphens normalized to spaces, which is safer
    for Oracle Text), and the de-duplicated tokens are OR-joined.
    Returns None when nothing usable remains.
    """
    if not text:
        return None

    normalized = strip_accents(text.lower())

    # Phrase-ish runs of letters/digits/hyphens/spaces, at least 3 chars long.
    candidate_phrases = re.findall(r"[a-z0-9][a-z0-9\- ]{2,}", normalized)

    query_tokens: list[str] = []

    for phrase in candidate_phrases:
        phrase = phrase.strip()
        if not phrase:
            continue

        # Word-level split ("store-and-forward" -> store / and / forward),
        # then drop stopwords and short words.
        parts = [
            word for word in re.findall(r"[a-z0-9]+", phrase)
            if word not in STOPWORDS and len(word) >= 4
        ]
        if not parts:
            continue

        # Single word goes in bare; multi-word phrases are quoted so Oracle
        # Text treats them as a phrase.
        if len(parts) == 1:
            query_tokens.append(parts[0])
        else:
            query_tokens.append(f"\"{' '.join(parts)}\"")

    query_tokens = sorted(set(query_tokens))
    return " OR ".join(query_tokens) if query_tokens else None
|
||
|
||
def detect_negative_conflict(graph_context, req):
    """Check graph rows for an explicit DOES_NOT_SUPPORT edge contradicting *req*.

    graph_context rows are (service, edge_type, target, ...) tuples as
    returned by query_knowledge_graph. Matching is a case-insensitive
    substring test of the requirement's expected_value / subject against the
    edge target.

    Returns {"conflict": True, "service": ..., "capability": ...} on the
    first match, otherwise {"conflict": False}.
    """
    expected = (req.get("expected_value") or "").lower().strip()
    subject = (req.get("subject") or "").lower().strip()

    # FIX: only match on non-empty terms. "" is a substring of every string,
    # so an empty expected_value/subject used to flag EVERY negative edge
    # as a conflict.
    terms = [t for t in (expected, subject) if t]
    if not terms:
        return {"conflict": False}

    for row in graph_context:
        service, edge_type, target, *_ = row

        if edge_type != "DOES_NOT_SUPPORT":
            continue

        target_lower = target.lower()
        if any(term in target_lower for term in terms):
            return {
                "conflict": True,
                "service": service,
                "capability": target
            }

    return {"conflict": False}
|
||
|
||
def query_knowledge_graph(raw_keywords: str, top_k: int = 20, min_score: int = 0):
    """Query the knowledge graph for SERVICE edges matching the given keywords.

    Uses an Oracle Text CONTAINS search on both endpoint node names; edges
    whose chunk was revoked in RAG_CHUNKS_GOV or whose confidence weight is
    zero are excluded. Rows come back as (service_name, relation_type,
    target_name, source_url, confidence_weight, relevance_score), best first.

    FIX: the keyword expression, min_score and top_k are now bind variables
    instead of being interpolated into the SQL text (injection-safe, shared
    cursor plan); the cursor is closed even when execution fails. Table names
    remain f-string interpolated because GRAPH_NAME is a trusted module
    constant.
    """
    safe_query = build_oracle_text_query(raw_keywords)
    if not safe_query:
        return []

    sql = f"""
        SELECT * FROM (
            SELECT
                s.NAME AS service_name,
                e.EDGE_TYPE AS relation_type,
                t.NAME AS target_name,
                e.SOURCE_URL,
                e.CONFIDENCE_WEIGHT,
                CASE
                    WHEN CONTAINS(s.NAME, :q) > 0 AND CONTAINS(t.NAME, :q) > 0 THEN 3
                    WHEN CONTAINS(s.NAME, :q) > 0 THEN 2
                    WHEN CONTAINS(t.NAME, :q) > 0 THEN 1
                    ELSE 0
                END AS relevance_score
            FROM KG_EDGES_{GRAPH_NAME} e
            JOIN KG_NODES_{GRAPH_NAME} s ON s.ID = e.SOURCE_ID
            JOIN KG_NODES_{GRAPH_NAME} t ON t.ID = e.TARGET_ID
            WHERE s.NODE_TYPE = 'SERVICE'
              AND (
                    CONTAINS(t.NAME, :q) > 0
                 OR CONTAINS(s.NAME, :q) > 0
              )
              AND e.CHUNK_HASH NOT IN (
                    SELECT CHUNK_HASH
                    FROM RAG_CHUNKS_GOV
                    WHERE STATUS = 'REVOKED'
              )
        )
        WHERE relevance_score >= :min_score
          AND CONFIDENCE_WEIGHT > 0
        ORDER BY relevance_score DESC
        FETCH FIRST :top_k ROWS ONLY
    """

    print(sql)

    cursor = oracle_conn.cursor()
    try:
        cursor.execute(sql, {
            "q": safe_query,
            "min_score": min_score,
            "top_k": top_k,
        })
        rows = cursor.fetchall()
    finally:
        cursor.close()

    for row in rows:
        print(row)

    return rows
|
||
|
||
# RE-RANK
|
||
|
||
def extract_terms_from_graph_text(graph_context):
    """Collect lowercase search terms from knowledge-graph output.

    Accepts either the row-tuple list returned by query_knowledge_graph
    (every string column becomes a term) or a textual triple dump in the
    form "ENTITY -[RELATION]-> ENTITY" (both entities become terms).
    Any other / empty input yields an empty set.
    """
    if not graph_context:
        return set()

    if isinstance(graph_context, list):
        terms = set()
        for row in graph_context:
            for col in row:
                if isinstance(col, str):
                    terms.add(col.lower())
        return terms

    if isinstance(graph_context, str):
        terms = set()
        # FIX: the previous pattern was corrupted (stray "$begin:math:display$"
        # LaTeX markers from a copy/paste) and could never match; this matches
        # "A -[REL]-> B" triples as produced by the extraction prompt.
        pairs = re.findall(r"([\w\s]+)-\[[\w_]+\]->([\w\s]+)", graph_context)
        for e1, e2 in pairs:
            terms.add(e1.strip().lower())
            terms.add(e2.strip().lower())
        return terms

    return set()
|
||
|
||
def rerank_documents_with_graph_terms(docs, query, graph_terms, top_k=12, per_source=2):
    """Re-rank retrieved docs against query words plus knowledge-graph terms.

    Each doc is scored by weighted term-hit count, term density, and the
    source-URL quality from score_arch_url. The top docs are then selected
    best-first, allowing at most *per_source* docs per source, up to *top_k*.
    """
    vocabulary = set(re.findall(r'\b\w+\b', query.lower())) | set(graph_terms)

    def relevance(doc):
        body = doc.page_content.lower()
        src_lower = (doc.metadata.get("source") or "").lower()
        hits = sum(body.count(term) for term in vocabulary)
        density = hits / max(len(body.split()), 1)
        return (hits * 2) + (density * 20) + score_arch_url(src_lower)

    ranked = sorted(docs, key=relevance, reverse=True)

    picked = []
    picks_per_source = {}

    for doc in ranked:
        src = doc.metadata.get("source")

        if picks_per_source.get(src, 0) >= per_source:
            continue

        picked.append(doc)
        picks_per_source[src] = picks_per_source.get(src, 0) + 1

        if len(picked) >= top_k:
            break

    return picked
|
||
|
||
def load_processed_hashes_from_graph():
    """Return the set of chunk hashes that already produced at least one
    edge in the knowledge graph (used to resume graph builds)."""
    cur = oracle_conn.cursor()
    cur.execute(f"""
        SELECT DISTINCT CHUNK_HASH
        FROM KG_EDGES_{GRAPH_NAME}
    """)
    processed = {record[0] for record in cur.fetchall()}
    cur.close()
    return processed
|
||
|
||
def rebuild_graph_from_faiss(
    faiss_path: str,
    reverse_resume: bool = True,
    consecutive_threshold: int = 20
):
    """Rebuild the knowledge graph from every chunk stored in a FAISS index.

    With reverse_resume=True, chunks are scanned from the end of the
    docstore backwards; chunks whose hash already has graph edges are
    skipped, and after *consecutive_threshold* consecutive already-processed
    chunks the scan stops, assuming everything earlier was processed too.
    Surviving chunks are then fed to create_knowledge_graph one at a time.
    """
    from langchain_community.vectorstores import FAISS

    print("🔄 Loading FAISS index...")

    vectorstore = FAISS.load_local(
        faiss_path,
        embeddings,
        allow_dangerous_deserialization=True
    )

    # NOTE(review): reaches into FAISS internals (docstore._dict) — private
    # attribute, may break across langchain versions.
    docs = list(vectorstore.docstore._dict.values())
    print(f"📄 {len(docs)} chunks loaded")

    if reverse_resume:
        print("🔁 Reverse resume mode active")

        processed_hashes = load_processed_hashes_from_graph()

        docs_to_process = []
        consecutive_processed = 0

        for d in reversed(docs):
            h = d.metadata.get("chunk_hash")

            if h in processed_hashes:
                consecutive_processed += 1
                if consecutive_processed >= consecutive_threshold:
                    print("🛑 Boundary detected. Stopping reverse scan.")
                    break
                continue
            else:
                consecutive_processed = 0
                docs_to_process.append(d)

        # Restore original (forward) order for processing.
        docs = list(reversed(docs_to_process))
        print(f"🚀 Will process {len(docs)} chunks")

    for chunk in tqdm(docs, desc="🧠 Building Graph"):
        create_knowledge_graph([chunk])

    print("✅ Graph rebuild completed.")
|
||
|
||
# SEMANTIC CHUNKING
|
||
|
||
def split_llm_output_into_chapters(llm_text):
    """Split LLM/markdown text into chapters.

    A new chapter starts at every heading line — markdown '#' headings or a
    fully bolded '**...**' line — as defined by the module-level
    chapter_separator_regex. Each chapter is returned as a stripped string.
    """
    chapters = []
    buffer = []

    for line in llm_text.splitlines():
        if re.match(chapter_separator_regex, line):
            # Flush the chapter accumulated so far and start a new one
            # headed by this line.
            if buffer:
                chapters.append("\n".join(buffer).strip())
            buffer = [line]
        else:
            buffer.append(line)

    if buffer:
        chapters.append("\n".join(buffer).strip())

    return chapters
|
||
|
||
|
||
def semantic_chunking(text):
    """Ask the LLM to restructure OCR-extracted text into headed sections,
    marking columns, tables, reference URLs and explicit metrics.

    FIX: the previous implementation retried forever on a bare ``except:``
    (which also swallowed KeyboardInterrupt). Failures are now caught as
    ``Exception`` only and retried a bounded number of times; the last error
    is re-raised when every attempt fails.
    """
    prompt = f"""
    You received the following text extracted via OCR:

    {text}

    Your task:
    1. Identify headings (short uppercase or bold lines, no period at the end) putting the Product Name (Application Name) and the Subject
    2. Separate paragraphs by heading
    3. Indicate columns with [COLUMN 1], [COLUMN 2] if present
    4. Indicate tables with [TABLE] in markdown format
    5. ALWAYS PUT THE URL if there is a Reference
    6. Indicate explicity metrics (if it exists)
    Examples:
    - Oracle Financial Services RTO is 1 hour
    - The Oracle Banking Supply Chain Finance Cloud Service A maximum number of 10K Hosted Transactions
    - The Oracle Banking Payments Cloud Service, Additional Non-Production Environment: You may purchase up to a maximum of ten (10) additional Non-Production Environments
    """

    MAX_ATTEMPTS = 5
    last_error = None

    for _ in range(MAX_ATTEMPTS):
        try:
            # response = llm_for_rag.invoke(prompt)
            return call_llm(llm_for_rag.invoke, prompt)
        except Exception as e:
            last_error = e
            print("[ERROR] Gen AI call error")

    raise last_error
|
||
|
||
def read_pdfs(pdf_path):
    """Load a PDF and return its full text, pages joined by newlines.

    Paths containing "-ocr" are read with PyMuPDFLoader; everything else
    goes through UnstructuredPDFLoader.
    """
    loader_cls = PyMuPDFLoader if "-ocr" in pdf_path else UnstructuredPDFLoader
    pages = loader_cls(str(pdf_path)).load()
    return "\n".join(page.page_content for page in pages)
|
||
|
||
|
||
def smart_split_text(text, max_chunk_size=10_000):
    """Split *text* into chunks of at most *max_chunk_size* characters,
    preferring to cut just after sentence-ending punctuation ('.', '!', '?')
    or a blank line; when no such boundary exists in the window, cut hard at
    the size limit. Empty chunks (after stripping) are dropped.
    """
    pieces = []
    cursor = 0
    total = len(text)

    while cursor < total:
        window_end = min(cursor + max_chunk_size, total)

        # Best natural boundary inside the window (-1 when none exists).
        boundary = max(
            text.rfind(mark, cursor, window_end)
            for mark in ('.', '!', '?', '\n\n')
        )

        # boundary <= cursor covers both "not found" (-1) and boundaries
        # at/behind the window start — fall back to a hard cut.
        if boundary <= cursor:
            cut = window_end
        else:
            cut = boundary + 1

        piece = text[cursor:cut].strip()
        if piece:
            pieces.append(piece)

        cursor = cut

    return pieces
|
||
|
||
|
||
def load_previously_indexed_docs():
    """Return the set of already-indexed document identifiers from the pickle
    checkpoint, or an empty set when no checkpoint file exists yet."""
    if not os.path.exists(PROCESSED_DOCS_FILE):
        return set()
    with open(PROCESSED_DOCS_FILE, "rb") as fh:
        return pickle.load(fh)
|
||
|
||
|
||
def save_indexed_docs(docs):
    """Persist the indexed-document set to the pickle checkpoint file."""
    with open(PROCESSED_DOCS_FILE, "wb") as fh:
        pickle.dump(docs, fh)
|
||
|
||
def retrieve_active_docs(query_terms: str, k: int = 50):
    """Retrieve chunks from the vector store, keeping only those whose
    chunk_hash is still ACTIVE in RAG_CHUNKS_GOV.

    Falls back to the raw retriever results when no chunk carries a hash.

    FIX: the previous ``TABLE(:hashes)`` bind cannot accept a plain Python
    list in python-oracledb without a declared SQL collection type, so the
    query always failed. Hashes are now bound individually (same pattern as
    get_chunk_metadata), batched to respect Oracle's 1000-item IN-list
    limit, and the cursor is closed via try/finally.
    """
    docs = retriever.invoke(query_terms)

    hashes = [d.metadata.get("chunk_hash") for d in docs if d.metadata.get("chunk_hash")]
    if not hashes:
        return docs[:k]

    active = set()
    cursor = oracle_conn.cursor()
    try:
        for start in range(0, len(hashes), 1000):
            batch = hashes[start:start + 1000]
            placeholders = ",".join(f":h{i}" for i in range(len(batch)))
            cursor.execute(
                f"""
                SELECT chunk_hash
                FROM RAG_CHUNKS_GOV
                WHERE chunk_hash IN ({placeholders})
                  AND status = 'ACTIVE'
                """,
                {f"h{i}": h for i, h in enumerate(batch)},
            )
            active.update(r[0] for r in cursor.fetchall())
    finally:
        cursor.close()

    return [d for d in docs if d.metadata.get("chunk_hash") in active][:k]
|
||
|
||
# Matches http(s) URLs, stopping at whitespace, quotes and common closing
# brackets so trailing prose punctuation is not captured.
URL_REGEX = re.compile(r"https?://[^\s\)\]\}<>\"']+", re.IGNORECASE)
|
||
|
||
def best_url(urls):
    """Return the URL from *urls* most likely to be useful documentation.

    Heuristic scoring: the official docs host and service/architecture pages
    gain points, generic landing pages lose points. Ties keep the earliest
    URL in the input.
    """
    service_terms = ["compute", "database", "oke", "storage", "network", "security"]
    topic_terms = ["overview", "architecture", "concepts", "how-to", "use-case"]
    generic_terms = ["home", "index", "portal", "release-notes", "faq"]

    def rate(url):
        lowered = url.lower()
        points = 0

        if "docs.oracle.com" in lowered:
            points += 3

        if any(term in lowered for term in service_terms):
            points += 5

        if any(term in lowered for term in topic_terms):
            points += 4

        if any(term in lowered for term in generic_terms):
            points -= 10

        return points

    return max(urls, key=rate)
|
||
|
||
def resolve_chunk_source(doc):
    """Pick the best source URL for a chunk.

    Priority: (1) a non-generic URL embedded in the chunk body, scored by
    best_url; (2) the "reference" metadata field when it is an http(s) URL;
    (3) the "source" metadata field when it is an http(s) URL; otherwise a
    generic documentation label.
    """
    content = doc.page_content or ""
    metadata = doc.metadata or {}

    # 1) Real URLs inside the chunk body, minus generic landing pages.
    embedded = [
        url for url in URL_REGEX.findall(content)
        if not any(marker in url.lower() for marker in ["home", "index", "portal"])
    ]
    if embedded:
        return best_url(embedded)

    # 2) "reference" usually beats "source"; 3) fall back to "source".
    for key in ("reference", "source"):
        candidate = metadata.get(key)
        if candidate and candidate.startswith("http"):
            return candidate

    return "Oracle Cloud Infrastructure documentation"
|
||
|
||
def extract_first_url_from_chunk(*texts: str) -> str | None:
    """Return the first URL found across the given texts, or None.

    Texts (chunk body, metadata fields, etc.) are scanned in order; falsy
    entries are skipped.
    """
    for candidate in texts:
        if not candidate:
            continue
        hit = URL_REGEX.search(candidate)
        if hit:
            return hit.group(0)
    return None
|
||
|
||
def aggregate_chunks_by_source(docs):
    """Group chunk texts by their best-known source.

    The bucket key is the first URL found in the chunk body or its
    reference/source metadata, falling back to the raw metadata fields and
    finally "UNKNOWN". Returns {key: [page_content, ...]}.
    """
    grouped = {}

    for doc in docs:
        meta = doc.metadata or {}

        bucket_key = (
            extract_first_url_from_chunk(
                doc.page_content,
                meta.get("reference", ""),
                meta.get("source", ""),
            )
            or meta.get("reference")
            or meta.get("source")
            or "UNKNOWN"
        )

        grouped.setdefault(bucket_key, []).append(doc.page_content)

    return grouped
|
||
|
||
def search_active_chunks(statement: str, k: int = 3000):
    """Semantic search over the vector store, filtered to ACTIVE chunks.

    Only docs whose chunk_hash is marked ACTIVE in RAG_CHUNKS_GOV survive;
    each surviving doc gets its metadata "source" replaced by the resolved
    best URL. Falls back to the raw results when no doc carries a hash.

    FIX: the IN-list used to be built by f-string interpolation of the hash
    values (injection-shaped, breaks on quotes) and could exceed Oracle's
    1000-item IN-list limit at the default k=3000. Hashes are now bound
    individually in batches of 1000 and the cursor is closed via try/finally.
    """
    docs = retriever.invoke(statement)

    hashes = [
        d.metadata.get("chunk_hash")
        for d in docs
        if d.metadata.get("chunk_hash")
    ]

    if not hashes:
        return docs[:k]

    active_hashes = set()
    cursor = oracle_conn.cursor()
    try:
        for start in range(0, len(hashes), 1000):
            batch = hashes[start:start + 1000]
            placeholders = ",".join(f":h{i}" for i in range(len(batch)))
            cursor.execute(
                f"""
                SELECT chunk_hash
                FROM RAG_CHUNKS_GOV
                WHERE status = 'ACTIVE'
                  AND chunk_hash IN ({placeholders})
                """,
                {f"h{i}": h for i, h in enumerate(batch)},
            )
            active_hashes.update(r[0] for r in cursor.fetchall())
    finally:
        cursor.close()

    final_docs = []
    for d in docs:
        h = d.metadata.get("chunk_hash")
        if h in active_hashes:
            d.metadata["source"] = resolve_chunk_source(d)
            final_docs.append(d)

    return final_docs[:k]
|
||
|
||
def search_manual_chunks_by_text(statement: str):
    """Find ACTIVE governance rows matching *statement* by simple text search.

    Matches case-insensitively against source, origin or chunk_hash, newest
    first, capped at 50 rows. Results are wrapped as metadata-only Documents
    (empty page_content) so they can flow through the same pipelines as
    vector-store hits.

    FIX: removed the stray line-continuation backslash that trailed the SQL
    literal, dropped the redundant local ``from langchain.schema import
    Document`` (the module already imports Document from
    langchain_core.documents — langchain.schema is the deprecated alias),
    and the cursor is now closed even when execution fails.
    """
    sql = """
        SELECT
            chunk_hash,
            source,
            created_at,
            origin,
            status
        FROM rag_chunks_gov
        WHERE status = 'ACTIVE'
          AND (
                LOWER(source) LIKE '%' || LOWER(:q) || '%'
             OR LOWER(origin) LIKE '%' || LOWER(:q) || '%'
             OR LOWER(chunk_hash) LIKE '%' || LOWER(:q) || '%'
          )
        ORDER BY created_at DESC
        FETCH FIRST 50 ROWS ONLY
    """

    cursor = oracle_conn.cursor()
    try:
        cursor.execute(sql, {"q": statement})
        rows = cursor.fetchall()
    finally:
        cursor.close()

    docs = []
    for h, source, created_at, origin, status in rows:
        docs.append(
            Document(
                page_content="",
                metadata={
                    "chunk_hash": h,
                    "source": source,
                    "created_at": created_at,
                    "origin": origin,
                    "status": status,
                }
            )
        )

    return docs
|
||
|
||
def search_chunks_for_invalidation(statement: str, k: int = 3000):
    """Collect candidate chunks for invalidation.

    Combines direct text matches from the governance table with semantic
    matches from the vector store, de-duplicated by chunk_hash. Manual
    matches come first, so they win on duplicates; docs without a hash are
    dropped.
    """
    manual = search_manual_chunks_by_text(statement)
    semantic = search_active_chunks(statement, k)

    unique_docs = []
    seen_hashes = set()

    for doc in [*manual, *semantic]:
        h = doc.metadata.get("chunk_hash")
        if h and h not in seen_hashes:
            seen_hashes.add(h)
            unique_docs.append(doc)

    return unique_docs
|
||
|
||
def revoke_chunk_by_hash(chunk_hash: str, reason: str):
    """Revoke a chunk: mark it REVOKED in RAG_CHUNKS_GOV and zero the
    confidence weight of every graph edge extracted from it.

    FIX: the edge update used to filter ``WHERE SOURCE_URL = :h`` while
    binding the chunk hash — comparing a URL column against a hash, so no
    edge was ever neutralized. Edges are keyed to their chunk by CHUNK_HASH.
    The cursor is also closed via try/finally now.
    """
    cursor = oracle_conn.cursor()
    try:
        cursor.execute("""
            UPDATE RAG_CHUNKS_GOV
            SET status = 'REVOKED',
                revoked_at = SYSTIMESTAMP,
                revocation_reason = :reason
            WHERE chunk_hash = :h
        """, {"h": chunk_hash, "reason": reason})

        cursor.execute(f"""
            UPDATE KG_EDGES_{GRAPH_NAME}
            SET CONFIDENCE_WEIGHT = 0
            WHERE CHUNK_HASH = :h
        """, {"h": chunk_hash})

        oracle_conn.commit()
    finally:
        cursor.close()
|
||
|
||
def get_chunk_metadata(chunk_hashes: list[str]) -> dict:
    """Fetch governance metadata for the given chunk hashes.

    Returns {hash: {"origin", "created_at", "status"}}; hashes absent from
    RAG_CHUNKS_GOV are simply omitted. An empty input yields an empty dict.
    """
    if not chunk_hashes:
        return {}

    # Bind each hash under a generated name (:h0, :h1, ...).
    binds = {f"h{i}": h for i, h in enumerate(chunk_hashes)}
    placeholders = ",".join(f":{name}" for name in binds)

    cursor = oracle_conn.cursor()

    cursor.execute(f"""
        SELECT
            CHUNK_HASH,
            ORIGIN,
            CREATED_AT,
            STATUS
        FROM RAG_CHUNKS_GOV
        WHERE CHUNK_HASH IN ({placeholders})
    """, binds)

    rows = cursor.fetchall()
    cursor.close()

    return {
        chunk: {
            "origin": origin,
            "created_at": created,
            "status": status,
        }
        for chunk, origin, created, status in rows
    }
|
||
|
||
def add_manual_knowledge_entry(
    *,
    text: str,
    author: str,
    reason: str,
    source: str = "MANUAL_INPUT",
    origin: str = "MANUAL",
    index_path: str = INDEX_PATH,
    also_update_graph: bool = True,
) -> str:
    """Register a manually-authored knowledge chunk across all stores.

    Records the chunk in the RAG_CHUNKS_GOV governance table (idempotent
    MERGE keyed on the content hash), adds it to the FAISS index, and
    optionally feeds it to the knowledge graph.

    Args:
        text: The knowledge content; hashed to produce the chunk id.
        author: Who supplied the entry (defaults to "unknown" if blank).
        reason: Why the entry was added; stored in document metadata.
        source: Source label stored in governance and metadata.
        origin: Origin tag ("MANUAL" by default).
        index_path: FAISS index directory to load/extend/save.
        also_update_graph: When True, best-effort graph update.

    Returns:
        The SHA-256 chunk hash of the normalized text.
    """
    text = (text or "").strip()
    reason = (reason or "").strip()
    author = (author or "").strip() or "unknown"

    h = chunk_hash(text)

    doc = Document(
        page_content=text,
        metadata={
            "source": source,
            "author": author,
            "reason": reason,
            "origin": origin,
            "created_at": datetime.utcnow().isoformat(),
            "chunk_hash": h,
        },
    )

    # Fix: wrap the cursor in try/finally so it does not leak if the MERGE
    # or commit raises.
    cur = oracle_conn.cursor()
    try:
        cur.execute(
            """
            MERGE INTO RAG_CHUNKS_GOV g
            USING (SELECT :h AS h FROM dual) src
            ON (g.CHUNK_HASH = src.h)
            WHEN NOT MATCHED THEN
            INSERT (CHUNK_HASH, SOURCE, STATUS, CREATED_AT, ORIGIN)
            VALUES (:h, :src, 'ACTIVE', SYSTIMESTAMP, :origin)
            """,
            {"h": h, "src": source, "origin": origin},
        )
        oracle_conn.commit()
    finally:
        cur.close()

    try:
        vs = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
        vs.add_documents([doc])
    except Exception:
        # NOTE(review): if load succeeds but add_documents fails, this branch
        # rebuilds the index from just this one document and the save below
        # overwrites the existing index — confirm that is intended.
        vs = FAISS.from_documents([doc], embedding=embeddings)

    vs.save_local(index_path)

    if also_update_graph:
        try:
            create_knowledge_graph([doc])
        except Exception:
            # Best-effort: a graph failure must not block knowledge ingestion.
            pass

    return h
def build_structured_evidence(docs, max_chunks=150):
    """Turn retrieved documents into quote/source evidence records.

    Takes at most *max_chunks* documents, truncates each body to 3000
    characters, strips any inline "Reference: <url>" marker (the source is
    resolved separately), and pairs the quote with its resolved source.

    Args:
        docs: Ranked Documents to convert.
        max_chunks: Cap on how many documents are included.

    Returns:
        List of {"quote": str, "source": str} dicts.
    """
    # Hoisted so the pattern is compiled once, not per document.
    reference_marker = re.compile(r"Reference:\s*\S+")

    structured = []
    for document in docs[:max_chunks]:
        excerpt = reference_marker.sub("", document.page_content[:3000])
        structured.append({
            "quote": excerpt,
            "source": resolve_chunk_source(document),
        })

    return structured
def get_context_from_requirement(req: dict):
    """Assemble text and graph evidence for a parsed RFP requirement.

    Pipeline: keyword extraction -> vector search + graph query ->
    negative-capability conflict injection -> graph-aware re-ranking ->
    structured evidence.

    Args:
        req: Parsed requirement with "requirement_type", "subject" and
            optionally "expected_value".

    Returns:
        Dict with "text_context", "graph_context", "requirement_type",
        "subject" and "expected_value" keys, ready for the decision prompt.
    """
    query_terms = extract_graph_keywords_from_requirement(req)

    docs = search_active_chunks(query_terms)
    graph_context = query_knowledge_graph(query_terms, top_k=50, min_score=1)

    neg = detect_negative_conflict(graph_context, req)
    if neg["conflict"]:
        print("⚠️ Negative capability found in graph.")
        # Inject a synthetic, maximum-priority edge so downstream logic
        # sees the conflict explicitly.
        conflict_row = (
            neg["service"],
            "NEGATIVE_CONFLICT_DETECTED",
            neg["capability"],
            None,
            999,
        )
        graph_context.append(conflict_row)

    graph_terms = extract_terms_from_graph_text(graph_context)
    reranked = rerank_documents_with_graph_terms(
        docs,
        query_terms,
        graph_terms,
        top_k=30,
    )
    structured_evidence = build_structured_evidence(reranked)

    return {
        "text_context": structured_evidence,
        "graph_context": graph_context,
        "requirement_type": req["requirement_type"],
        "subject": req["subject"],
        "expected_value": req.get("expected_value", ""),
    }
# =========================
# Main Function
# =========================
def chat():
    """Interactive entry point: index any new PDFs into FAISS + the knowledge
    graph, then answer RFP questions in a REPL loop.

    NOTE(review): the ``chain`` built below is a local variable and is never
    invoked in this function — answers come from
    ``answer_question_with_retries()``, which relies on the module-level chain
    set up by ``load_all()``. Confirm whether the local chain is still needed.
    """
    PDF_FOLDER = Path("docs")  # folder containing the source PDF files

    pdf_paths = sorted(
        str(p) for p in PDF_FOLDER.glob("*.pdf")
    )

    # Documents indexed in previous runs are skipped below.
    already_indexed_docs = load_previously_indexed_docs()
    updated_docs = set()

    try:
        vectorstore = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
        print("✔️ FAISS index loaded.")
    except Exception:
        vectorstore = None
        print("⚠️ FAISS index not found, creating a new one.")

    new_chunks = []

    for pdf_path in tqdm(pdf_paths, desc=f"📄 Processing PDFs"):
        print(f" {os.path.basename(pdf_path)}")
        if pdf_path in already_indexed_docs:
            print(f"✅ Document already indexed: {pdf_path}")
            continue
        full_text = read_pdfs(pdf_path=pdf_path)
        path_url = filename_to_url(os.path.basename(pdf_path))

        text_chunks = smart_split_text(full_text, max_chunk_size=10_000)
        # Carries an incomplete trailing chapter over into the next chunk.
        overflow_buffer = ""

        for chunk in tqdm(text_chunks, desc=f"📄 Processing text chunks", dynamic_ncols=True, leave=False):
            current_text = overflow_buffer + chunk

            treated_text = semantic_chunking(current_text)

            # semantic_chunking is expected to return an LLM message object
            # exposing `.content`; anything else is reported and skipped.
            if hasattr(treated_text, "content"):
                chapters = split_llm_output_into_chapters(treated_text.content)

                last_chapter = chapters[-1] if chapters else ""

                # A chapter not ending in sentence punctuation is assumed to
                # be cut mid-sentence; defer it to the next chunk iteration.
                if last_chapter and not last_chapter.strip().endswith((".", "!", "?", "\n\n")):
                    print("📌 Last chapter seems incomplete, saving for the next cycle")
                    overflow_buffer = last_chapter
                    chapters = chapters[:-1]
                else:
                    overflow_buffer = ""

                for chapter_text in chapters:
                    # Append the source URL so the citation survives inside
                    # the chunk text itself.
                    reference_url = "Reference: " + path_url
                    chapter_text = chapter_text + "\n" + reference_url

                    h = chunk_hash(chapter_text)

                    cursor = oracle_conn.cursor()

                    # Idempotent governance insert: MERGE only adds hashes
                    # not seen before.
                    cursor.execute("""
                        MERGE INTO RAG_CHUNKS_GOV g
                        USING (SELECT :h AS h FROM dual) src
                        ON (g.CHUNK_HASH = src.h)
                        WHEN NOT MATCHED THEN
                        INSERT (
                            CHUNK_HASH,
                            SOURCE,
                            STATUS,
                            CREATED_AT,
                            ORIGIN
                        )
                        VALUES (
                            :h,
                            :src,
                            'ACTIVE',
                            SYSTIMESTAMP,
                            :origin
                        )
                    """, {
                        "h": h,
                        "src": pdf_path,
                        "origin": "PDF"
                    })
                    oracle_conn.commit()
                    # NOTE(review): cursor leaks if execute/commit raises —
                    # consider a try/finally here.
                    cursor.close()

                    doc = Document(
                        page_content=chapter_text,
                        metadata={
                            "source": pdf_path,
                            "reference": reference_url,
                            "chunk_hash": h,
                            "created_at": datetime.utcnow().isoformat()
                        }
                    )

                    new_chunks.append(doc)
                    print(f"✅ New chapter indexed:\n{chapter_text}...\n")

            else:
                print(f"[ERROR] semantic_chunking returned unexpected type: {type(treated_text)}")

        updated_docs.add(str(pdf_path))

    if new_chunks:
        if vectorstore:
            vectorstore.add_documents(new_chunks)
        else:
            vectorstore = FAISS.from_documents(new_chunks, embedding=embeddings)

        vectorstore.save_local(INDEX_PATH)
        save_indexed_docs(already_indexed_docs.union(updated_docs))
        print(f"💾 {len(new_chunks)} chunks added to FAISS index.")

        print("🧠 Building knowledge graph...")
        create_knowledge_graph(new_chunks)

    else:
        print("📁 No new documents to index.")

    # NOTE(review): retriever is created here but never referenced again in
    # this function — confirm it is still needed.
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 50, "fetch_k": 100})

    RFP_DECISION_TEMPLATE = """
You are answering an RFP requirement with risk awareness.

You MUST validate the answer ONLY using CAPABILITY nodes returned in GRAPH FACTS.
If no capability exists, answer MUST be "UNKNOWN".

Requirement:
Type: {requirement_type}
Subject: {subject}
Expected value: {expected_value}

**Document evidence**:
{text_context}

**Graph evidence**:
{graph_context}

Decision rules:
- Answer ONLY with YES, NO or PARTIAL
- If value differs, answer PARTIAL
- If not found, answer NO

Interpretation rules (MANDATORY):
- If a capability is supported but requires reboot, downtime, or restart, it STILL counts as YES unless the requirement explicitly forbids it.
- "Servidor em funcionamento" means the resource exists and is active before the operation, not that it must remain online without interruption.
- Only answer NO if the operation is NOT supported at all or requires destroying and recreating the resource.
- Reboot, restart, or brief unavailability MUST NOT be interpreted as lack of support.

Confidence rules:
- HIGH: Explicit evidence directly answers the requirement
- MEDIUM: Evidence partially matches or requires light interpretation
- LOW: Requirement is ambiguous OR evidence is indirect OR missing

Ambiguity rules:
- ambiguity_detected = true if:
- The requirement can be interpreted in more than one way
- Keywords are vague (e.g. "support", "integration", "capability")
- Evidence does not clearly bind to subject + expected value

Service scope rules (MANDATORY):
- Evidence is valid ONLY if it refers to the SAME service category as the requirement.
- Do NOT use evidence from a different Oracle Cloud service to justify another.
- PREFER ALWAYS URL to source

SOURCE RULE:
- GIVE MANY SOURCES URL
- GIVE A COMPLETE PATH OF URL SOURCES TO UNDERSTAND THE CONCEPTS THEME
- GIVE one or more OVERVIEW SOURCE URL
- GIVE one or more SOLUTION AND ARCHITECTURE SOURCE URL

OUTPUT CONSTRAINTS (MANDATORY):
- Return ONLY a valid JSON object
- Do NOT include explanations, comments, markdown, lists, or code fences
- Do NOT write any text before or after the JSON
- The response must start with an opening curly brace and end with a closing curly brace

LANGUAGE RULE (MANDATORY): **DO TRANSLATION AS THE LAST STEP**
- Write ALL textual values in {lang}
- Keep JSON keys in English
- Do NOT translate keys

JSON schema (return exactly this structure):
{{
"answer": "YES | NO | PARTIAL",
"confidence": "HIGH | MEDIUM | LOW",
"ambiguity_detected": true,
"confidence_reason": "<short reason>",
"justification": "<short factual explanation>",
"evidence": [
{{ "quote": "...",
"source": "..." }},
{{ "quote": "...",
"source": "..." }},
{{ "quote": "...",
"source": "..." }},
]
}}
"""
    prompt = PromptTemplate.from_template(RFP_DECISION_TEMPLATE)

    # NOTE(review): each RunnableMap entry below invokes
    # get_context_from_requirement() independently, so retrieval runs twice
    # per question — confirm whether this duplication is intentional.
    chain = (
        RunnableLambda(lambda q: {
            "question": q,
            "req": parse_rfp_requirement(q)
        })
        | RunnableMap({
            "text_context": lambda x: get_context_from_requirement(x["req"])["text_context"],
            "graph_context": lambda x: get_context_from_requirement(x["req"])["graph_context"],
            "requirement_type": lambda x: x["req"]["requirement_type"],
            "subject": lambda x: x["req"]["subject"],
            "expected_value": lambda x: x["req"].get("expected_value", ""),
            "lang": lambda x: normalize_lang(detect(x["question"]))
        })
        | prompt
        | llm
        | StrOutputParser()
    )

    print("✅ READY")

    while True:
        query = input("❓ Question (or 'quit' to exit): ")
        if query.lower() == "quit":
            break
        response = answer_question_with_retries(query, max_attempts=3)

        print("\n📜 RESPONSE:\n")
        print(response)
        print("\n" + "=" * 80 + "\n")
def safe_parse_llm_answer(raw: str) -> dict:
    """Parse LLM output that should be a bare JSON object.

    Strips markdown code fences the model sometimes adds despite the prompt,
    then attempts ``json.loads``. On any failure (including ``raw=None``) a
    conservative fallback answer is returned instead of raising.

    Args:
        raw: The raw LLM response text (may be None).

    Returns:
        The parsed dict, or a safe NO/LOW fallback on parse failure.
    """
    try:
        # `or ""` guards against raw=None without relying on the except path.
        cleaned = (raw or "").replace("```json", "").replace("```", "").strip()
        return json.loads(cleaned)
    except Exception:
        # Fix: fallback now includes "ambiguity_detected", matching the
        # fallback shape used by answer_question().
        return {
            "answer": "NO",
            "confidence": "LOW",
            "ambiguity_detected": True,
            "confidence_reason": "Invalid JSON from LLM",
            "justification": "",
            "evidence": []
        }
def answer_question_with_retries(
    question: str,
    max_attempts: int = 3
) -> dict:
    """Ask the LLM up to *max_attempts* times and return the best answer.

    Each attempt's YES answer is checked against explicit negative graph
    capabilities (DOES_NOT_SUPPORT / NEGATIVE_CONFLICT_DETECTED edges) and
    downgraded to NO on conflict. Stops early on a YES/HIGH answer; otherwise
    returns the highest-scoring attempt, with sources sanitized.

    Args:
        question: The raw RFP question text.
        max_attempts: Maximum number of LLM attempts.

    Returns:
        The parsed (and sanitized) answer dict.
    """
    best = None
    best_score = -1

    # The graph context is needed up front to detect negative-capability
    # conflicts against each attempt's answer.
    req = parse_rfp_requirement(question)
    graph_context = query_knowledge_graph(
        extract_graph_keywords_from_requirement(req),
        top_k=20
    )

    for attempt in range(1, max_attempts + 1):

        raw = answer_question(question)
        parsed = safe_parse_llm_answer(raw)

        # Negative-conflict handling: an explicit negative edge in the graph
        # overrides an optimistic YES from the LLM.
        if parsed.get("answer") == "YES":
            for row in graph_context:
                service, edge_type, target, *_ = row

                if edge_type in ("DOES_NOT_SUPPORT", "NEGATIVE_CONFLICT_DETECTED"):
                    print("❌ Conflict detected — forcing downgrade to NO")

                    parsed["answer"] = "NO"
                    parsed["confidence"] = "HIGH"
                    parsed["confidence_reason"] = \
                        "Graph contains explicit negative capability"

                    break

        ans = parsed.get("answer", "NO")
        conf = parsed.get("confidence", "LOW")

        score = score_answer(parsed)

        print(
            f"🔁 Attempt {attempt}: "
            f"answer={ans} confidence={conf} score={score}"
        )

        if score > best_score:
            best = parsed
            best_score = score

        # Ideal stopping condition: confident positive answer.
        if ans == "YES" and conf == "HIGH":
            print("✅ Optimal answer found (YES/HIGH)")
            return parsed

    print("⚠️ Optimal answer not found, returning best available")

    # Fix: `best` could remain None (every attempt scored <= -1, or
    # max_attempts < 1), and None was then passed downstream.
    if best is None:
        best = {
            "answer": "NO",
            "confidence": "LOW",
            "ambiguity_detected": True,
            "confidence_reason": "No scored answer produced",
            "justification": "",
            "evidence": [],
        }

    best = validate_and_sanitize_sources(best)

    return best
|
||
# =========================
|
||
# ARCHITECTURE SOURCE VALIDATION
|
||
# =========================
|
||
|
||
def _sanitize_source_field(src):
    """Normalize a source field: a URL string or a list of URL strings.

    Every URL that does not resolve is replaced by INVALID_SOURCE_TOKEN;
    an empty/missing field becomes the token itself. The input container
    shape (scalar vs list) is preserved.
    """
    if not src:
        return INVALID_SOURCE_TOKEN

    if isinstance(src, list):
        return [
            item if url_exists(item) else INVALID_SOURCE_TOKEN
            for item in src
        ]

    return src if url_exists(src) else INVALID_SOURCE_TOKEN
def validate_architecture_sources(plan: dict) -> dict:
    """Sanitize every source URL carried by an architecture plan in place.

    Covers two locations: ``architecture.components[].source`` and
    ``decisions[].evidence.source``. A falsy *plan* is returned untouched.

    Args:
        plan: The architecture plan dict (mutated in place).

    Returns:
        The same plan object, with dead/empty sources replaced.
    """
    if not plan:
        return plan

    # architecture.components[].source
    for component in plan.get("architecture", {}).get("components", []):
        component["source"] = _sanitize_source_field(component.get("source"))

    # decisions[].evidence.source
    for decision in plan.get("decisions", []):
        evidence = decision.get("evidence", {}) or {}
        evidence["source"] = _sanitize_source_field(evidence.get("source"))
        decision["evidence"] = evidence

    return plan
|
||
# =========================
|
||
# LOADERS
|
||
# =========================
|
||
|
||
def load_all():
    """(Re)load module-level FAISS retriever, decision prompt and chains.

    Side effects: assigns the module globals ``vectorstore``, ``retriever``,
    ``chain``, ``RFP_DECISION_TEMPLATE`` and ``chain_architecture``. If the
    FAISS index cannot be loaded, the retriever globals are left unset and
    only a message is printed (chain construction still proceeds).
    """
    global vectorstore, retriever, graph, chain, RFP_DECISION_TEMPLATE, chain_architecture
    print("🔄 Loading FAISS + Graph + Chain...")

    try:
        vectorstore = FAISS.load_local(
            INDEX_PATH,
            embeddings,
            allow_dangerous_deserialization=True
        )

        retriever = vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 50, "fetch_k": 100}
        )
    # Fix: was a bare `except:` which also swallowed KeyboardInterrupt
    # and SystemExit.
    except Exception:
        print("No Faiss")

    RFP_DECISION_TEMPLATE = """
You are answering an RFP requirement with risk awareness.

Requirement:
Type: {requirement_type}
Subject: {subject}
Expected value: {expected_value}

Document evidence:
{text_context}

Graph evidence:
{graph_context}

Decision rules:
- Answer ONLY with YES, NO or PARTIAL
- If value differs, answer PARTIAL
- If not found, answer NO

Interpretation rules (MANDATORY):
- If a capability is supported but requires reboot, downtime, or restart, it STILL counts as YES unless the requirement explicitly forbids it.
- "Servidor em funcionamento" means the resource exists and is active before the operation, not that it must remain online without interruption.
- Only answer NO if the operation is NOT supported at all or requires destroying and recreating the resource.
- Reboot, restart, or brief unavailability MUST NOT be interpreted as lack of support.

Confidence rules:
- HIGH: Explicit evidence directly answers the requirement
- MEDIUM: Evidence partially matches or requires light interpretation
- LOW: Requirement is ambiguous OR evidence is indirect OR missing

Ambiguity rules:
- ambiguity_detected = true if:
- The requirement can be interpreted in more than one way
- Keywords are vague (e.g. "support", "integration", "capability")
- Evidence does not clearly bind to subject + expected value

Service scope rules (MANDATORY):
- Do NOT use evidence from a different Oracle Cloud service to justify another.

SOURCE RULE:
- GIVE MANY SOURCES URL
- GIVE A COMPLETE PATH OF URL SOURCES TO UNDERSTAND THE CONCEPTS THEME
- GIVE one or more OVERVIEW SOURCE URL
- GIVE one or more SOLUTION AND ARCHITECTURE SOURCE URL

OUTPUT CONSTRAINTS (MANDATORY):
- Return ONLY a valid JSON object
- Do NOT include explanations, comments, markdown, lists, or code fences
- Do NOT write any text before or after the JSON
- The response must start with an opening curly brace and end with a closing curly brace

LANGUAGE RULE (MANDATORY): **DO TRANSLATION AS THE LAST STEP**
- Write ALL textual values in {lang}
- Keep JSON keys in English
- Do NOT translate keys

JSON schema (return exactly this structure):
{{
"answer": "YES | NO | PARTIAL",
"confidence": "HIGH | MEDIUM | LOW",
"ambiguity_detected": true,
"confidence_reason": "<short reason>",
"justification": "<short factual explanation>",
"evidence": [
{{ "quote": "...",
"source": "..." }},
{{ "quote": "...",
"source": "..." }},
{{ "quote": "...",
"source": "..." }},
]

}}
"""
    prompt = PromptTemplate.from_template(RFP_DECISION_TEMPLATE)

    def _prepare_prompt_inputs(x: dict) -> dict:
        # Fix: the previous RunnableMap called get_context_from_requirement()
        # twice per question (once for text_context, once for graph_context),
        # doubling vector-search and graph-query cost. Compute it once here.
        ctx = get_context_from_requirement(x["req"])
        return {
            "text_context": ctx["text_context"],
            "graph_context": ctx["graph_context"],
            "requirement_type": x["req"]["requirement_type"],
            "subject": x["req"]["subject"],
            "expected_value": x["req"].get("expected_value", ""),
            "lang": normalize_lang(detect(x["question"])),
        }

    chain = (
        RunnableLambda(lambda q: {
            "question": q,
            "req": parse_rfp_requirement(q)
        })
        | RunnableLambda(_prepare_prompt_inputs)
        | prompt
        | llm
        | StrOutputParser()
    )

    chain_architecture = build_architecture_chain()

    print("✅ Loaded!")
def answer_question(
    question: str,
    max_attempts: int = MAX_ATTEMPTS
) -> str:
    """Fan out *max_attempts* parallel chain invocations and return the best.

    Each attempt is scored; the highest-scoring parsed answer wins and is
    returned as a JSON string after source sanitization. If every attempt
    fails, a safe NO/LOW fallback JSON is returned instead.
    """

    def run_attempt(attempt_no):
        # One full chain invocation; never raises — failures score -1 so
        # they are filtered out later.
        try:
            raw_output = call_llm(chain.invoke, question)

            parsed_output = safe_parse_llm_answer(raw_output)
            attempt_score = score_answer(parsed_output)

            return {
                "attempt": attempt_no,
                "raw": raw_output,
                "parsed": parsed_output,
                "score": attempt_score
            }

        except Exception as exc:
            print(f"❌ Attempt {attempt_no} failed: {exc}")

            return {
                "attempt": attempt_no,
                "raw": "",
                "parsed": {},
                "score": -1
            }

    completed = []

    with ThreadPoolExecutor(max_workers=max_attempts) as pool:
        pending = [
            pool.submit(run_attempt, n)
            for n in range(1, max_attempts + 1)
        ]

        for future in as_completed(pending):
            try:
                outcome = future.result()
            except Exception as exc:
                print("❌ Future crashed:", exc)
                continue
            # Failed attempts carry score -1 and are dropped here.
            if outcome["score"] >= 0:
                completed.append(outcome)

    # Absolute fallback: nothing usable came back from any attempt.
    if not completed:
        print("⚠️ All attempts failed — returning safe fallback")
        return json.dumps({
            "answer": "NO",
            "confidence": "LOW",
            "ambiguity_detected": True,
            "confidence_reason": "All LLM attempts failed",
            "justification": "",
            "evidence": []
        })

    completed.sort(key=lambda item: item["score"], reverse=True)

    for item in completed:
        print(
            f"⚡ Attempt {item['attempt']} | "
            f"answer={item['parsed'].get('answer')} | "
            f"confidence={item['parsed'].get('confidence')} | "
            f"score={item['score']}"
        )

    winner = completed[0]

    print(f"\n🏆 Selected attempt {winner['attempt']} (score={winner['score']})")

    sanitized = validate_and_sanitize_sources(winner["parsed"])
    return json.dumps(sanitized)
||
def reload_all():
    """Re-initialize all module-level state (FAISS retriever, prompt, chains)
    by delegating to load_all()."""
    load_all()
||
# NOTE(review): this runs at import time, so merely importing the module
# loads FAISS and builds the chains — confirm that side effect is intended.
reload_all()

# 🚀 Run
if __name__ == "__main__":
    chat()
    #rebuild_graph_from_faiss(INDEX_PATH, reverse_resume=True)