Files
rfp_response_automation/files/rfp_process.py
2026-02-18 20:34:33 -03:00

428 lines
12 KiB
Python

import pandas as pd
import requests
import json
from pathlib import Path
import os
import re
import logging
from config_loader import load_config
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from queue import Queue
import threading
from oci_genai_llm_graphrag_rerank_rfp import answer_question
# Runtime configuration; must expose at least ``service_port`` (see config_loader).
config = load_config()
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s"
)
# NOTE(review): appears unused in this file — confirm before removing.
EXCEL_QUEUE = Queue()
# =========================
# Settings
# =========================
# Local chat endpoint used when calling the service over HTTP.
API_URL = "http://127.0.0.1:" + str(config.service_port) + "/chat"
# Plain-text log of answers that came back NO or with low/medium confidence.
QUERY_LOG_FILE = Path("queries_with_low_confidence_or_no.txt")
CONTEXT_COLUMNS = [1, 2]  # USE IF YOU HAVE A NON-HIERARCHICAL STRUCTURE
ORDER_COLUMN = 0  # WHERE YOUR ORDER/LINE-NUMBER COLUMN IS
QUESTION_COLUMN = 4  # WHERE YOUR QUESTION/TEXT COLUMN IS (submitted to RFP AI)
# Hierarchy depths that get processed ("x.x" = depth 2, etc.).
ALLOWED_STRUCTURES = [
    "x.x",
    "x.x.x",
    "x.x.x.x",
    "x.x.x.x.x",
    "x.x.x.x.x.x"
]
# Any of these separators is normalized to "." before depth checks.
ALLOWED_SEPARATORS = [".", "-", "/", "_", ">"]
ANSWER_COL = "ANSWER"  # NAME YOUR COLUMN for the YES/NO/PARTIAL result
JSON_COL = "RESULT_JSON"  # NAME YOUR COLUMN for the RFP AI automation results
# NOTE(review): ARCH_PLAN_COL / MERMAID_COL are defined but not written by
# process_excel_rfp in this file — confirm they are used elsewhere.
ARCH_PLAN_COL = "ARCH_PLAN"
MERMAID_COL = "MERMAID"
CONFIDENCE_COL = "CONFIDENCE"
AMBIGUITY_COL = "AMBIGUITY"
CONF_REASON_COL = "CONFIDENCE_REASON"
JUSTIFICATION_COL = "JUSTIFICATION"
# =========================
# Helpers
# =========================
def normalize_structure(num: str, separators: list[str]) -> str:
    """Collapse every allowed separator in *num* to a dot; "" stays ""."""
    if not num:
        return ""
    sep_class = f"[{re.escape(''.join(separators))}]"
    return re.sub(sep_class, ".", num.strip())
def should_process(num: str, allowed_patterns: list[str], separators: list[str]) -> bool:
    """Return True when *num* should be submitted to the engine.

    Non-hierarchical identifiers are always processed; hierarchical ones
    must sit at one of the depths described by *allowed_patterns*
    (depth = number of dot-separated segments).
    """
    canonical = normalize_structure(num, separators)
    if not is_hierarchical(canonical):
        return True
    permitted_depths = {pattern.count(".") + 1 for pattern in allowed_patterns}
    return canonical.count(".") + 1 in permitted_depths
def register_failed_query(query: str, answer: str, confidence: str):
    """Append a negative / low-confidence result to the review log file."""
    QUERY_LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
    logger.info("Negative/Doubt result")
    entry = (
        "----------------------------\n"
        f"Query:\n{query}\n\n"
        f"Answer: {answer}\n"
        f"Confidence: {confidence}\n\n"
    )
    with QUERY_LOG_FILE.open("a", encoding="utf-8") as log:
        log.write(entry)
def normalize_num(num: str) -> str:
    """Trim surrounding whitespace, then any trailing dots ("1.2." -> "1.2")."""
    trimmed = num.strip()
    return trimmed.rstrip(".")
def build_question_from_columns(row, context_cols: list[int], question_col: int) -> str:
    """Compose a prompt from flat context columns plus the question cell.

    Duplicate context values are dropped (first occurrence wins); with no
    context the bare question text is returned.
    """
    question = str(row.iloc[question_col]).strip()
    raw_parts = (str(row.iloc[col]).strip() for col in context_cols)
    unique_parts = dict.fromkeys(part for part in raw_parts if part)
    if not unique_parts:
        return question
    context = " > ".join(unique_parts)
    return f'Considering the context of "{context}", {question}'
def build_question(hierarchy: dict, current_num: str) -> str:
    """Build a contextualized question for a hierarchical item number.

    The highest existing ancestor of ``current_num`` supplies the main
    subject, intermediate ancestors supply extra context, and the item's
    own text is the question itself.

    Raises:
        ValueError: when no ancestor of ``current_num`` exists in *hierarchy*.
    """
    if not is_hierarchical(current_num):
        return hierarchy[current_num]["text"]
    parts = current_num.split(".")
    main_subject = None
    main_key = None
    # Highest existing ancestor becomes the main subject.
    for i in range(1, len(parts) + 1):
        key = ".".join(parts[:i])
        if key in hierarchy:
            main_subject = hierarchy[key]["text"]
            main_key = key
            break
    if not main_subject:
        raise ValueError(f"No valid root subject for {current_num}")
    # Intermediate ancestors only. BUG FIX: the original loop also appended
    # the item's own text (key == current_num), duplicating the question
    # inside its context.
    subtopics = []
    for i in range(1, len(parts)):
        key = ".".join(parts[: i + 1])
        if key in hierarchy and key not in (main_key, current_num):
            subtopics.append(hierarchy[key]["text"])
    specific = hierarchy[current_num]["text"]
    if subtopics:
        context = " > ".join([main_subject, *subtopics])
        # BUG FIX: the original returned only the context sentence and
        # dropped the question text entirely; mirror the format used by
        # build_question_from_columns().
        return f'Considering the context of "{context}", {specific}'
    return f'What is the {specific} of {main_subject}?'
def normalize_api_response(api_response) -> dict:
    """Coerce an engine/API reply into a flat result dict.

    Accepts a JSON string, a dict already carrying "answer" or "error",
    or an envelope with a dict under "result". Anything else is reported
    as an "error" dict rather than raised.
    """
    # String payload: attempt to decode it first.
    if isinstance(api_response, str):
        try:
            api_response = json.loads(api_response)
        except Exception:
            return {"error": f"Invalid string response: {api_response[:300]}"}
    if not isinstance(api_response, dict):
        return {"error": f"Invalid type: {type(api_response)}"}
    if "error" in api_response:
        return api_response
    inner = api_response.get("result")
    if isinstance(inner, dict):
        return inner
    if "answer" in api_response:
        return api_response
    return {"error": f"Unexpected format: {str(api_response)[:300]}"}
def call_api(
    question: str,
    *,
    api_url: str,
    timeout: int,
    auth_user: str | None,
    auth_pass: str | None,
) -> dict:
    """POST *question* to the RFP chat endpoint and return the JSON body.

    Raises:
        RuntimeError: on 5xx responses, gateway-timeout bodies, or a
            response body that is not valid JSON.
    """
    payload = {"question": question}
    response = requests.post(
        api_url,
        json=payload,
        auth=(auth_user, auth_pass) if auth_user else None,
        timeout=timeout
    )
    if response.status_code >= 500:
        # BUG FIX: RuntimeError accepts no ``response`` keyword — the
        # original raise itself crashed with a TypeError.
        raise RuntimeError(
            f"Server error {response.status_code}: {response.text}"
        )
    text = response.text.lower()
    # Some gateways return 200 with a timeout page in the body.
    if "gateway time" in text or "timeout" in text:
        raise RuntimeError(response.text)
    try:
        return response.json()
    except ValueError as exc:
        # Narrowed from a bare ``except:``; requests' JSON decode errors
        # subclass ValueError.
        raise RuntimeError(
            f"Invalid JSON: {response.text[:300]}"
        ) from exc
def is_explicit_url(source: str) -> bool:
    """True when *source* is an absolute http(s) URL."""
    return source.startswith(("http://", "https://"))
def is_hierarchical(num: str) -> bool:
    """True for dotted all-numeric identifiers such as "1.2.3"."""
    if not num or "." not in num:
        return False
    return all(segment.isdigit() for segment in num.split("."))
def normalize_evidence_sources(evidence: list[dict]) -> list[dict]:
    """Ensure every evidence item carries a usable source.

    Items whose source is an explicit URL pass through untouched; anything
    else is rebuilt with a stripped quote and a generic OCI-documentation
    fallback source.
    """
    normalized = []
    for ev in evidence:
        # ROBUSTNESS FIX: ``or ""`` guards against explicit JSON nulls —
        # dict.get's default only covers a *missing* key, so a present
        # None value previously crashed on .strip().
        source = (ev.get("source") or "").strip()
        quote = (ev.get("quote") or "").strip()
        if is_explicit_url(source):
            normalized.append(ev)
            continue
        normalized.append({
            "quote": quote,
            "source": source or "Oracle Cloud Infrastructure documentation"
        })
    return normalized
def build_justification_with_links(justification: str, evidence: list[dict]) -> str:
    """
    Combine justification text + evidence URLs in a readable format for Excel.
    """
    if not evidence:
        return justification or ""
    url_set = {
        ev.get("source", "").strip()
        for ev in evidence
        if is_explicit_url(ev.get("source", "").strip())
    }
    if not url_set:
        return justification or ""
    links_text = "\n".join(f"- {u}" for u in sorted(url_set))
    if justification:
        return f"{justification}\n\nSources:\n{links_text}"
    return f"Sources:\n{links_text}"
def call_api_with_retry(question, max_minutes=30, **kwargs):
    """Call the API with capped exponential backoff until *max_minutes*.

    Non-retryable failures (auth/bad-request markers in the message, or a
    malformed-JSON body) are re-raised immediately; everything else is
    retried with a delay growing from 5s up to 60s.

    Raises:
        RuntimeError: when the retry budget is exhausted.
    """
    start = time.time()
    attempt = 0
    delay = 5
    while True:
        try:
            return call_api(question, **kwargs)
        except Exception as e:
            attempt += 1
            elapsed = time.time() - start
            msg = str(e).lower()
            # BUG FIX: call_api raises "Invalid JSON: ..." — the previous
            # check for "invalid json format" never matched, so malformed
            # responses were retried for the full budget.
            if any(x in msg for x in ["401", "403", "400", "invalid json"]):
                raise
            if elapsed > max_minutes * 60:
                raise RuntimeError(
                    f"Timeout after {attempt} attempts / {int(elapsed)}s"
                )
            logger.info(
                f"🔁 Retry {attempt} | waiting {delay}s | {e}"
            )
            time.sleep(delay)
            delay = min(delay * 1.5, 60)
def call_local_engine(question: str) -> dict:
    """Answer *question* in-process via the GraphRAG engine (no HTTP hop)."""
    return answer_question(question)
# =========================
# Main
# =========================
def process_excel_rfp(
    input_excel: Path,
    output_excel: Path,
    *,
    api_url: str,
    timeout: int = 120,
    auth_user: str | None = None,
    auth_pass: str | None = None,
) -> Path:
    """Answer every eligible question in an RFP spreadsheet and save results.

    Reads *input_excel*, builds a hierarchy of item numbers -> question
    text, submits each eligible item to the local engine, and writes the
    answers (plus confidence/justification/raw JSON columns) to
    *output_excel*.

    NOTE(review): the api_url/timeout/auth_* parameters are only consumed
    by the commented-out HTTP path below; the active path calls the local
    engine. Confirm before removing them.

    Returns:
        The *output_excel* path, for chaining.
    """
    # dtype=str + fillna("") keeps every cell a plain string.
    df = pd.read_excel(input_excel, dtype=str).fillna("")
    # Ensure all result columns exist before any row is written.
    for col in [
        ANSWER_COL,
        JSON_COL,
        CONFIDENCE_COL,
        AMBIGUITY_COL,
        CONF_REASON_COL,
        JUSTIFICATION_COL
    ]:
        if col not in df.columns:
            df[col] = ""
    # Map normalized item number -> its question text and row index.
    hierarchy = {}
    for idx, row in df.iterrows():
        num = normalize_num(str(row.iloc[ORDER_COLUMN]))
        text = str(row.iloc[QUESTION_COLUMN]).strip()
        if num and text:
            hierarchy[num] = {"text": text, "row": idx}
    # =========================================
    # 🔥 PARALLEL WORKER
    # =========================================
    def process_row(num, info):
        # Build the prompt for one row, call the engine, and return
        # (row index, prompt, normalized response). Never raises: errors
        # come back as an {"error": ...} payload so one bad row cannot
        # kill the batch.
        try:
            row = df.loc[info["row"]]
            if is_hierarchical(num):
                question = build_question(hierarchy, num)
            else:
                question = build_question_from_columns(
                    row,
                    CONTEXT_COLUMNS,
                    QUESTION_COLUMN
                )
            logger.info(f"\n🔸 QUESTION {num} SENT TO API:\n{question}")
            # Remote HTTP path, kept for when the service is used again:
            # raw = call_api_with_retry(
            #     question,
            #     api_url=api_url,
            #     timeout=timeout,
            #     auth_user=auth_user,
            #     auth_pass=auth_pass
            # )
            raw = call_local_engine(question)
            resp = normalize_api_response(raw)
            return info["row"], question, resp
        except Exception as e:
            return info["row"], "", {"error": str(e)}
    # =========================================
    # PARALLEL EXECUTION - FUTURE - OCI ACCEPTS ONLY 1 HERE
    # =========================================
    futures = []
    with ThreadPoolExecutor(max_workers=1) as executor:
        for num, info in hierarchy.items():
            if not should_process(num, ALLOWED_STRUCTURES, ALLOWED_SEPARATORS):
                continue
            futures.append(executor.submit(process_row, num, info))
        # Consume results as they complete and write them into df.
        for f in as_completed(futures):
            row_idx, question, api_response = f.result()
            # Re-normalizing an already-normalized dict is a harmless no-op.
            api_response = normalize_api_response(api_response)
            try:
                if "error" in api_response:
                    raise Exception(api_response["error"])
                if "evidence" in api_response:
                    api_response["evidence"] = normalize_evidence_sources(
                        api_response["evidence"]
                    )
                # NO answers and shaky confidence go to the review log.
                if (
                    api_response.get("answer") == "NO"
                    or api_response.get("confidence") in ("MEDIUM", "LOW")
                ):
                    register_failed_query(
                        query=question,
                        answer=api_response.get("answer", ""),
                        confidence=api_response.get("confidence", "")
                    )
                df.at[row_idx, ANSWER_COL] = api_response.get("answer", "ERROR")
                df.at[row_idx, CONFIDENCE_COL] = api_response.get("confidence", "")
                df.at[row_idx, AMBIGUITY_COL] = str(api_response.get("ambiguity_detected", ""))
                df.at[row_idx, CONF_REASON_COL] = api_response.get("confidence_reason", "")
                df.at[row_idx, JUSTIFICATION_COL] = build_justification_with_links(
                    api_response.get("justification", ""),
                    api_response.get("evidence", [])
                )
                df.at[row_idx, JSON_COL] = json.dumps(api_response, ensure_ascii=False)
                logger.info(json.dumps(api_response, indent=2))
            except Exception as e:
                # Any per-row failure is recorded in the sheet itself.
                df.at[row_idx, ANSWER_COL] = "ERROR"
                df.at[row_idx, CONFIDENCE_COL] = "LOW"
                df.at[row_idx, JUSTIFICATION_COL] = str(e)
                logger.info(f"❌ ERROR: {e}")
    df.to_excel(output_excel, index=False)
    return output_excel
if __name__ == "__main__":
    import sys

    # ROBUSTNESS FIX: fail with a usage message instead of a bare
    # IndexError when the input workbook path is missing.
    if len(sys.argv) < 2:
        sys.exit("Usage: python rfp_process.py <input_excel>")
    input_path = Path(sys.argv[1])
    # Write results next to the input as "<name>_result.xlsx".
    output_path = input_path.with_name(input_path.stem + "_result.xlsx")
    process_excel_rfp(
        input_excel=input_path,
        output_excel=output_path,
        api_url=API_URL,
    )