mirror of
https://github.com/hoshikawa2/rfp_response_automation.git
synced 2026-03-06 02:10:41 +00:00
3096 lines
92 KiB
Python
3096 lines
92 KiB
Python
from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
|
||
from langchain_core.prompts import PromptTemplate
|
||
from langchain.schema.output_parser import StrOutputParser
|
||
from langchain_community.embeddings import OCIGenAIEmbeddings
|
||
from langchain_community.vectorstores import FAISS
|
||
from langchain.schema.runnable import RunnableMap
|
||
from langchain_community.document_loaders import UnstructuredPDFLoader, PyMuPDFLoader
|
||
from langchain_core.documents import Document
|
||
from langchain_core.runnables import RunnableLambda
|
||
from pathlib import Path
|
||
from tqdm import tqdm
|
||
import os
|
||
import pickle
|
||
import re
|
||
import atexit
|
||
import oracledb
|
||
import json
|
||
import base64
|
||
import hashlib
|
||
from datetime import datetime
|
||
import requests
|
||
import textwrap
|
||
import unicodedata
|
||
from typing import Optional
|
||
from collections import deque
|
||
from langchain.callbacks.base import BaseCallbackHandler
|
||
from langdetect import detect
|
||
from config_loader import load_config
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
import threading
|
||
|
||
def chunk_hash(text: str) -> str:
    """Return the SHA-256 hex digest of *text* (UTF-8 encoded)."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
|
||
|
||
# Load application configuration (paths, credentials, model ids) from disk.
config = load_config()

# =========================
# Oracle Autonomous Configuration
# =========================
WALLET_PATH = config.wallet_path
DB_ALIAS = config.db_alias
USERNAME = config.username
PASSWORD = config.password
# oracledb resolves TNS aliases through the wallet directory named by TNS_ADMIN.
os.environ["TNS_ADMIN"] = WALLET_PATH

# =========================
# Global Configurations
# =========================
INDEX_PATH = config.index_path
# Pickle file tracking which documents have already been ingested.
PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl")
# Matches markdown headings (# .. ######) or fully-bolded lines used as chapter breaks.
chapter_separator_regex = r"^(#{1,6} .+|\*\*.+\*\*)$"
GRAPH_NAME = config.graph_name
# In-memory ring buffer of recent log lines (entries beyond 500 are dropped).
LOG_BUFFER = deque(maxlen=500)
# Number of parallel self-consistency attempts per question.
MAX_ATTEMPTS = 3
# Upper bound on concurrent GenAI calls; the semaphore gates call_llm().
GENAI_MAX_CONCURRENT = 1000
GENAI_SEMAPHORE = threading.Semaphore(GENAI_MAX_CONCURRENT)
|
||
|
||
def call_llm(fn, *args, **kwargs):
    """Invoke *fn* under the global GenAI concurrency semaphore.

    All positional and keyword arguments are forwarded unchanged and the
    callee's return value is passed through.
    """
    GENAI_SEMAPHORE.acquire()
    try:
        return fn(*args, **kwargs)
    finally:
        GENAI_SEMAPHORE.release()
|
||
|
||
# =========================
|
||
# LLM Definitions
|
||
# =========================
|
||
|
||
# Deterministic chat model (temperature 0, top_p 1) for structured/JSON tasks.
llm = ChatOCIGenAI(
    model_id=config.llm_model,
    service_endpoint=config.service_endpoint,
    compartment_id=config.compartment_id,
    auth_profile=config.auth_profile,
    model_kwargs={"temperature": 0, "top_p": 1, "max_tokens": 4000},
)

# Chat model with provider-default sampling, used by the RAG chains.
llm_for_rag = ChatOCIGenAI(
    model_id=config.llm_model,
    service_endpoint=config.service_endpoint,
    compartment_id=config.compartment_id,
    auth_profile=config.auth_profile,
)

# Model backing the architecture-planning chain.
# NOTE(review): "lrm" looks like a typo for "llm"; the name is kept because
# build_architecture_chain() references it.
lrm_for_architect = ChatOCIGenAI(
    model_id=config.llm_model,
    service_endpoint=config.service_endpoint,
    compartment_id=config.compartment_id,
    auth_profile=config.auth_profile,
)

# Embedding model used for the FAISS vector store.
embeddings = OCIGenAIEmbeddings(
    model_id=config.embedding_model,
    service_endpoint=config.service_endpoint,
    compartment_id=config.compartment_id,
    auth_profile=config.auth_profile,
)

# Single process-wide Oracle Autonomous DB connection (mTLS via wallet).
oracle_conn = oracledb.connect(
    user=USERNAME,
    password=PASSWORD,
    dsn=DB_ALIAS,
    config_dir=WALLET_PATH,
    wallet_location=WALLET_PATH,
    wallet_password=PASSWORD
)
# Best-effort close at interpreter shutdown.
atexit.register(lambda: oracle_conn.close())
|
||
|
||
def filename_to_url(filename: str, suffix: str = ".pdf") -> str:
    """Decode a URL that was stored as a URL-safe base64 file name.

    Inverse of the ingestion step that encodes a document URL with
    ``base64.urlsafe_b64encode`` and appends *suffix*.

    Args:
        filename: Encoded file name, optionally ending in *suffix*.
        suffix: Extension to strip before decoding (default ``".pdf"``).

    Returns:
        The original URL as a UTF-8 string.

    Raises:
        binascii.Error: If the payload is not valid base64.
    """
    if filename.endswith(suffix):
        filename = filename[: -len(suffix)]
    # Re-pad to a multiple of 4: trailing '=' characters are frequently
    # stripped when base64 is embedded in file names, and urlsafe_b64decode
    # raises on unpadded input.
    filename += "=" * (-len(filename) % 4)
    decoded = base64.urlsafe_b64decode(filename.encode("ascii"))
    return decoded.decode("utf-8")
|
||
|
||
def default_logger(msg):
    """Fallback logger: write *msg* to stdout."""
    print(msg)
|
||
|
||
# =========================================
|
||
# ARCHITECTURE-SPECIFIC SOURCE RANKING
|
||
# =========================================
|
||
|
||
class BrowserLogCallback(BaseCallbackHandler):
    """LangChain callback that mirrors chain/LLM/retriever progress to a logger.

    *logger* is any callable accepting a single message string (e.g. a
    function that streams status lines to a browser console).
    """

    def __init__(self, logger):
        # Callable invoked with each status line.
        self.log = logger

    # ---------- CHAIN ----------
    def on_chain_start(self, serialized, inputs, **kwargs):
        self.log("🔵 Chain started")

    def on_chain_end(self, outputs, **kwargs):
        self.log("🟢 Chain finished")

    # ---------- LLM ----------
    def on_llm_start(self, serialized, prompts, **kwargs):
        self.log("🤖 LLM call started")
        # Only the first prompt's size is reported.
        self.log(f"📏 Prompt size: {len(prompts[0])} chars")

    def on_llm_end(self, response, **kwargs):
        self.log("✅ LLM response received")

    # ---------- RETRIEVER ----------
    def on_retriever_start(self, serialized, query, **kwargs):
        self.log(f"🔍 Searching vector store: {query}")

    def on_retriever_end(self, documents, **kwargs):
        self.log(f"📚 Retrieved {len(documents)} chunks")

    # ---------- ERRORS ----------
    def on_chain_error(self, error, **kwargs):
        self.log(f"❌ ERROR: {error}")
|
||
|
||
# URL substrings that mark high-value architecture reading material
# (+3 each in score_arch_url).
ARCH_GOOD_HINTS = [
    "overview",
    "concept",
    "architecture",
    "service",
    "use-case"
]

# URL substrings that mark low-value landing / maintenance pages
# (-5 each in score_arch_url).
ARCH_BAD_HINTS = [
    "home",
    "index",
    "portal",
    "release-notes",
    "troubleshoot",
    "known-issues"
]
|
||
|
||
|
||
def score_arch_url(url: str) -> int:
    """Heuristically rank a URL's usefulness as architecture evidence.

    +3 per "good" hint substring, -5 per "bad" hint substring, and +2
    when the URL is hosted on docs.oracle.com. Falsy input scores 0.
    """
    if not url:
        return 0

    lowered = url.lower()

    total = 3 * sum(1 for hint in ARCH_GOOD_HINTS if hint in lowered)
    total -= 5 * sum(1 for hint in ARCH_BAD_HINTS if hint in lowered)

    if "docs.oracle.com" in lowered:
        total += 2

    return total
|
||
|
||
|
||
def resolve_arch_source(doc):
    """Pick the best source URL for an architecture chunk.

    Same idea as resolve_chunk_source, but ranked with the
    architecture-specific heuristic; does NOT affect the rest of the
    pipeline. Candidates are URLs found in the chunk text plus the
    "reference" / "source" metadata fields.

    Returns:
        The highest-scoring candidate URL, or a generic fallback string
        when no candidate exists.
    """
    text = doc.page_content or ""
    md = doc.metadata or {}

    candidates = []
    candidates += URL_REGEX.findall(text)

    if md.get("reference"):
        candidates.append(md["reference"])

    if md.get("source"):
        candidates.append(md["source"])

    if not candidates:
        return "Oracle Cloud Infrastructure documentation"

    # dict.fromkeys() dedupes while keeping first-seen order, so ties in
    # score_arch_url resolve deterministically (list(set(...)) made the
    # winner depend on hash-ordering between runs).
    candidates = list(dict.fromkeys(candidates))
    candidates.sort(key=score_arch_url, reverse=True)

    return candidates[0]
|
||
|
||
def strip_accents(s: str) -> str:
    """Return *s* with combining accent marks removed.

    Decomposes to NFD and drops every combining mark (category 'Mn').
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
|
||
|
||
def normalize_lang(code: str) -> str:
    """Map a langdetect ISO-639 code to an English language name.

    Unknown codes fall back to "English".
    """
    names = {
        "pt": "Portuguese",
        "en": "English",
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "it": "Italian"
    }
    return names.get(code, "English")
|
||
|
||
# =========================
|
||
# SOURCE VALIDATION (POST LLM)
|
||
# =========================
|
||
|
||
# Placeholder written in place of evidence URLs that fail validation.
INVALID_SOURCE_TOKEN = "---------"
# Per-request timeout (seconds) for URL existence probes.
URL_TIMEOUT = 3

# Memoizes url_exists() results for the process lifetime (unbounded).
_url_cache = {}
|
||
|
||
def url_exists(url: str) -> bool:
    """Check whether *url* resolves to a real (non-404, non-stub) page.

    Results are memoized in the module-level ``_url_cache``. Anything
    falsy or not starting with "http" is rejected outright. Oracle's
    soft-404 template (HTTP 200 with a "page not found" body) counts as
    missing.

    Returns:
        True when the page appears to exist; False otherwise, including
        on any network or parse error.
    """
    if not url or not url.startswith("http"):
        return False

    if url in _url_cache:
        return _url_cache[url]

    try:
        r = requests.get(
            url,
            timeout=URL_TIMEOUT,
            allow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0"}
        )

        if r.status_code >= 400:
            _url_cache[url] = False
            return False

        html = (r.text or "").lower()

        # ====================================
        # Oracle soft-404 template detection
        # ====================================
        soft_404_patterns = [
            "<title>page not found",
            "that page is not available",
            "class=\"page-not-found\"",
            "redwood-light-404.css",
            "error-container"
        ]

        if any(p in html for p in soft_404_patterns):
            _url_cache[url] = False
            return False

        # ====================================
        # Minimal REAL content (layout stripped)
        # ====================================
        # Lazy import: BeautifulSoup is only needed on this path.
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html, "html.parser")

        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()

        visible_text = soup.get_text(" ", strip=True)

        # Stub detection by visible-text length is currently disabled;
        # restore the threshold below to re-enable it.
        # if len(visible_text) < 300:
        #     _url_cache[url] = False
        #     return False

        _url_cache[url] = True
        return True

    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; any request/parse failure means
        # "not reachable".
        _url_cache[url] = False
        return False
|
||
|
||
def validate_and_sanitize_sources(answer: dict) -> dict:
    """Replace unreachable evidence URLs with ``INVALID_SOURCE_TOKEN``.

    Each evidence entry's "source" (a string or a list of strings) is
    probed with url_exists(); failures are replaced by the placeholder
    token. The input dict is not mutated: a shallow copy of the top
    level plus a fresh evidence list is returned. Non-dict input yields
    a canned "NO" answer.

    Note: the leftover debug ``print`` of rejected sources was removed.
    """
    if not isinstance(answer, dict):
        return {"answer": "NO", "confidence": "LOW", "ambiguity_detected": True,
                "confidence_reason": "Invalid answer type", "justification": "", "evidence": []}

    # Shallow copy of the top level + fresh evidence list.
    out = dict(answer)
    evidences = out.get("evidence", [])

    if not isinstance(evidences, list):
        return out

    new_evidences = []
    for ev in evidences:
        if not isinstance(ev, dict):
            # Malformed entry: keep as-is rather than drop information.
            new_evidences.append(ev)
            continue

        ev2 = dict(ev)
        src = ev2.get("source")

        if isinstance(src, list):
            ev2["source"] = [
                s if (isinstance(s, str) and url_exists(s)) else INVALID_SOURCE_TOKEN
                for s in src
            ]
        else:
            ev2["source"] = (
                src if (isinstance(src, str) and url_exists(src)) else INVALID_SOURCE_TOKEN
            )

        new_evidences.append(ev2)

    out["evidence"] = new_evidences
    return out
|
||
|
||
# =========================
|
||
# LRM Definitions
|
||
# =========================
|
||
def build_architecture_evidence(docs, max_chunks=30):
    """Build the evidence payload for the architecture prompt.

    Chunks are ranked by the architecture URL heuristic and the top
    *max_chunks* become {"quote", "source"} dicts. Quotes are capped at
    3000 chars and inline "Reference: <url>" markers are stripped so the
    LLM cannot copy stale links from the text body.

    Args:
        docs: LangChain documents (page_content + metadata).
        max_chunks: Maximum number of evidence entries to emit.

    Returns:
        List of {"quote": str, "source": str} dicts, best-ranked first.
    """
    # Resolve each chunk's source exactly once: resolve_arch_source()
    # regex-scans the chunk text, so the original pattern of calling it
    # in the sort key AND per output entry did the work twice (and could
    # even disagree between the two calls).
    resolved = [(d, resolve_arch_source(d)) for d in docs]
    resolved.sort(key=lambda pair: score_arch_url(pair[1]), reverse=True)

    evidence = []
    for d, source in resolved[:max_chunks]:
        quote = d.page_content[:3000]
        quote = re.sub(r"Reference:\s*\S+", "", quote)
        evidence.append({
            "quote": quote,
            "source": source
        })

    return evidence
|
||
|
||
def enforce_architecture_sources(plan: dict, evidence: list[dict]) -> dict:
    """Force every decision in *plan* to cite a source present in *evidence*.

    Decisions whose evidence source is not one of the known evidence
    sources get the best-matching evidence item substituted (matched by
    token overlap between the service name and the evidence text). If
    nothing matches and the decision has no source at all, the first
    evidence source is used as a fallback. Mutates and returns *plan*.
    """
    if not evidence:
        return plan

    # Evidence entries that actually carry a source.
    ev_list = [e for e in evidence if e.get("source")]
    valid_sources = {
        str(e.get("source", ""))
        for e in ev_list
        if e.get("source")
    }

    def pick_best_source(service: str) -> dict | None:
        # Rank evidence by how many >=3-char alphanumeric tokens of the
        # service name appear in its quote+source text; None when no
        # token overlaps at all.
        if not service:
            return None

        s = service.lower()

        best = None
        best_score = -1

        service_terms = [t for t in re.findall(r"[a-z0-9]+", s) if len(t) >= 3]

        for e in ev_list:
            hay = (e.get("quote", "") + " " + e.get("source", "")).lower()
            score = sum(1 for t in service_terms if t in hay)

            if score > best_score:
                best_score = score
                best = e

        return best if best_score > 0 else None

    for d in plan.get("decisions", []):
        service = d.get("service", "")
        ev = d.get("evidence", {}) or {}

        # Already cites a known source — leave untouched.
        if ev.get("source") in valid_sources:
            continue

        best_ev = pick_best_source(service)
        if best_ev:
            d["evidence"] = {
                "quote": best_ev.get("quote", ev.get("quote", "")),
                "source": best_ev["source"],
            }
            continue

        # No match at all: only backfill when the decision has no source.
        # NOTE(review): ev_list[0] raises IndexError if *evidence* is
        # non-empty but no entry has a source — confirm callers always
        # pass sourced evidence.
        if not ev.get("source"):
            d["evidence"] = {
                "quote": ev.get("quote", ""),
                "source": ev_list[0]["source"],
            }

    return plan
|
||
|
||
def build_architecture_chain():
    """Build the LCEL chain that designs an OCI architecture for a question.

    Pipeline: parse the RFP requirement, fetch text + graph context ONCE,
    fill the architecture prompt, invoke the architect model, and return
    its raw string output. Progress is streamed via BrowserLogCallback.

    Returns:
        A Runnable invoked with the question string.
    """

    ARCH_PROMPT = PromptTemplate.from_template("""
You are a senior OCI Cloud Architect.

TASK:
Design an OCI architecture for:

{question}

You MUST design the solution using the provided documentation evidence.

**DOCUMENT EVIDENCE** (JSON):
{text_context}

**GRAPH FACTS**:
{graph_context}

Rules (MANDATORY):
- Use ONLY services supported by **DOCUMENT EVIDENCE**.
- The architecture may involve MULTIPLE OCI services; therefore decisions may require DIFFERENT sources (do not reuse a single URL for everything unless it truly applies).
- Set each service from this OCI Services:
Oracle Cloud Infrastructure Service (OCI):

Compute (IaaS)
• Compute Instances (VM)
• Bare Metal Instances
• Dedicated VM Hosts
• GPU Instances
• Confidential Computing
• Capacity Reservations
• Autoscaling (Instance Pools)
• Live Migration
• Oracle Cloud VMware Solution (OCVS)
• HPC (High Performance Computing)
• Arm-based Compute (Ampere)

Storage

Object Storage
• Object Storage
• Object Storage – Archive
• Pre-Authenticated Requests
• Replication

Block & File
• Block Volume
• Boot Volume
• Volume Groups
• File Storage
• File Storage Snapshots
• Data Transfer Service

Networking
• Virtual Cloud Network (VCN)
• Subnets
• Internet Gateway
• NAT Gateway
• Service Gateway
• Dynamic Routing Gateway (DRG)
• FastConnect
• Load Balancer (L7 / L4)
• Network Load Balancer
• DNS
• Traffic Management Steering Policies
• IP Address Management (IPAM)
• Network Firewall
• Web Application Firewall (WAF)
• Bastion
• Capture Traffic (VTAP)
• Private Endpoints

Security, Identity & Compliance
• Identity and Access Management (IAM)
• Compartments
• Policies
• OCI Vault
• OCI Key Management (KMS)
• OCI Certificates
• OCI Secrets
• OCI Bastion
• Cloud Guard
• Security Zones
• Vulnerability Scanning Service
• Data Safe
• Logging
• Audit
• OS Management / OS Management Hub
• Shielded Instances
• Zero Trust Packet Routing

Databases

Autonomous
• Autonomous Database (ATP)
• Autonomous Data Warehouse (ADW)
• Autonomous JSON Database

Databases Gerenciados
• Oracle Database Service
• Oracle Exadata Database Service
• Exadata Cloud@Customer
• Base Database Service
• MySQL Database Service
• MySQL HeatWave
• NoSQL Database Cloud Service
• TimesTen
• PostgreSQL (OCI managed)
• MongoDB API (OCI NoSQL compatibility)

Analytics & BI
• Oracle Analytics Cloud (OAC)
• OCI Data Catalog
• OCI Data Integration
• OCI Streaming Analytics
• OCI GoldenGate
• OCI Big Data Service (Hadoop/Spark)
• OCI Data Science
• OCI AI Anomaly Detection
• OCI AI Forecasting

AI & Machine Learning

Generative AI
• OCI Generative AI
• OCI Generative AI Agents
• OCI Generative AI RAG
• OCI Generative AI Embeddings
• OCI AI Gateway (OpenAI-compatible)

AI Services
• OCI Vision (OCR, image analysis)
• OCI Speech (STT / TTS)
• OCI Language (NLP)
• OCI Document Understanding
• OCI Anomaly Detection
• OCI Forecasting
• OCI Data Labeling

Containers & Cloud Native
• OCI Container Engine for Kubernetes (OKE)
• Container Registry (OCIR)
• Service Mesh
• API Gateway
• OCI Functions (FaaS)
• OCI Streaming (Kafka-compatible)
• OCI Queue
• OCI Events
• OCI Resource Manager (Terraform)

Integration & Messaging
• OCI Integration Cloud (OIC)
• OCI Service Connector Hub
• OCI Streaming
• OCI GoldenGate
• OCI API Gateway
• OCI Events Service
• OCI Queue
• Real Applications Clusters (RAC)

Developer Services
• OCI DevOps (CI/CD)
• OCI Code Repository
• OCI Build Pipelines
• OCI Artifact Registry
• OCI Logging Analytics
• OCI Monitoring
• OCI Notifications
• OCI Bastion
• OCI CLI
• OCI SDKs

Observability & Management
• OCI Monitoring
• OCI Alarms
• OCI Logging
• OCI Logging Analytics
• OCI Application Performance Monitoring (APM)
• OCI Operations Insights
• OCI Management Agent
• OCI Resource Discovery

Enterprise & Hybrid
• Oracle Cloud@Customer
• Exadata Cloud@Customer
• Compute Cloud@Customer
• Dedicated Region Cloud@Customer
• OCI Roving Edge Infrastructure
• OCI Alloy

Governance & FinOps
• OCI Budgets
• Cost Analysis
• Usage Reports
• Quotas
• Tagging
• Compartments
• Resource Search

Regions & Edge
• OCI Regions (Commercial, Government, EU Sovereign)
• OCI Edge Services
• OCI Roving Edge
• OCI Dedicated Region

STRICT SERVICE GROUNDING RULE (MANDATORY):
- For each decision, use evidence from the SAME service_group as the decision service.
- Do NOT justify one service using evidence from another service's documentation.

SOURCE RULES (STRICT):
- Copy ONLY URLs that appear EXACTLY in DOCUMENT EVIDENCE or GRAPH FACTS
- NEVER create or guess URLs
- If no URL is explicitly present, set source = null
- It is allowed to return null
- GIVE MANY SOURCES URL
- GIVE A COMPLETE PATH OF URL SOURCES TO UNDERSTAND THE CONCEPTS THEME
- GIVE one or more OVERVIEW SOURCE URL
- GIVE one or more SOLUTION AND ARCHITECTURE SOURCE URL

MANDATORY:
- Break into requirements
- Map each requirement to OCI services
- Justify each choice

LANGUAGE RULE (MANDATORY): **DO TRANSLATION AS THE LAST STEP**
- Write ALL textual values in {lang}
- Keep JSON keys in English
- Do NOT translate keys

Return ONLY JSON:

{{
"problem_summary": "...",
"architecture": {{
"components": [
{{
"id": "api",
"service": "OCI API Gateway",
"purpose": "...",
"source": ["__AUTO__"],
"connects_to": []
}}
]
}},
"decisions": [
{{
"service": "...",
"reason": "must cite evidence",
"evidence": {{
"quote": "...",
"source": ["__AUTO__"]
}}
}}
]
}}
""")

    callback = BrowserLogCallback(default_logger)

    def _prepare(q):
        # Fetch the expensive retrieval context a single time. The
        # previous RunnableMap called get_architecture_context() once per
        # prompt field, doubling vector-store and graph queries.
        req = parse_rfp_requirement(q)
        ctx = get_architecture_context(req)
        return {
            "question": q,
            "text_context": ctx["text_context"],
            "graph_context": ctx["graph_context"],
            "lang": normalize_lang(detect(q)),
        }

    chain_arch = (
        RunnableLambda(_prepare)
        | ARCH_PROMPT
        | lrm_for_architect
        | StrOutputParser()
    ).with_config(callbacks=[callback])

    return chain_arch
|
||
|
||
def score_url_quality(url: str) -> int:
    """Score a URL's citation quality for architecture plans.

    Best-match bonus (elif chain): /solutions/ pages +8, YouTube +7,
    conceptual docs +5, any docs.oracle.com page +3. A -10 penalty
    applies when the URL looks like a landing/maintenance page. Falsy
    input scores 0.
    """
    if not url:
        return 0

    lowered = url.lower()

    if "/solutions/" in lowered:
        quality = 8
    elif "youtube.com" in lowered or "youtu.be" in lowered:
        quality = 7
    elif any(marker in lowered for marker in (
        "architecture", "overview", "concept", "how-to", "use-case"
    )):
        quality = 5
    elif "docs.oracle.com" in lowered:
        quality = 3
    else:
        quality = 0

    low_value = ("home", "index", "portal", "release-notes", "faq", "troubleshoot")
    if any(marker in lowered for marker in low_value):
        quality -= 10

    return quality
|
||
|
||
|
||
def score_architecture_plan(plan: dict) -> int:
    """Score an architecture plan for self-consistency selection.

    +3 per component, +4 per decision, plus per-decision URL quality
    (score_url_quality) and up to +4 for quote length (1 point per 500
    chars). Falsy plans score -1.
    """
    if not plan:
        return -1

    components = plan.get("architecture", {}).get("components", [])
    decisions = plan.get("decisions", [])

    total = 3 * len(components) + 4 * len(decisions)

    for decision in decisions:
        ev = decision.get("evidence", {}) or {}

        sources = ev.get("source", [])
        if isinstance(sources, str):
            sources = [sources]
        total += sum(score_url_quality(s) for s in sources)

        quote = ev.get("quote", "") or ""
        total += min(len(quote) // 500, 4)

    return total
|
||
|
||
def call_architecture_planner(
    question: str,
    parallel_attempts: int = MAX_ATTEMPTS,
    log=default_logger
):
    """Run the architecture chain several times in parallel; keep the best plan.

    Self-consistency strategy: *parallel_attempts* independent LLM runs
    are parsed and scored with score_architecture_plan(); the highest
    scoring plan wins and is then passed through source validation.

    Args:
        question: RFP question / requirement text.
        parallel_attempts: Number of concurrent attempts.
        log: Logger callable (defaults to stdout).

    Returns:
        The validated architecture plan dict.
    """
    log("\n🏗️ ARCHITECTURE (PARALLEL SELF-CONSISTENCY)")

    def worker(i):
        # One independent generate + parse + score round.
        # (Stray print(0..3) debug statements were removed.)
        raw = call_llm(chain_architecture.invoke, question)
        plan = safe_parse_architecture_json(raw)

        try:
            score = score_architecture_plan(plan)
        except Exception:
            # Malformed plan: keep the attempt but rank it last.
            # (Narrowed from a bare except.)
            print("Error scoring", plan)
            score = 0

        return {
            "attempt": i,
            "plan": plan,
            "score": score
        }

    results = []

    with ThreadPoolExecutor(max_workers=parallel_attempts) as executor:
        futures = [
            executor.submit(worker, i)
            for i in range(1, parallel_attempts + 1)
        ]

        for f in as_completed(futures):
            results.append(f.result())

    results.sort(key=lambda r: r["score"], reverse=True)

    for r in results:
        log(f"⚡ Attempt {r['attempt']} score={r['score']}")

    best = results[0]["plan"]

    log(f"\n🏆 Selected architecture from attempt {results[0]['attempt']}")

    # best is already a parsed dict; safe_parse_architecture_json() is a
    # pass-through for dicts and is kept as a safety net for odd output.
    plan = safe_parse_architecture_json(best)
    plan = validate_architecture_sources(plan)

    return plan
|
||
|
||
def architecture_to_mermaid(plan: dict) -> str:
    """Render the plan's component graph as Mermaid flowchart source.

    Nodes are labelled "<service>\\n<purpose>" with the purpose
    word-wrapped at 28 chars; edges come from each component's
    "connects_to" list, whose items may be ids (str) or {"id": ...}
    dicts.

    Returns:
        Mermaid source text; a stub diagram when there are no components.
    """
    architecture = plan.get("architecture", {})
    comps = architecture.get("components", [])

    if not comps:
        return "flowchart TB\nEmpty[No components]"

    # Top-to-bottom reads better once the diagram grows past 6 nodes.
    direction = "TB" if len(comps) > 6 else "LR"

    lines = [f"flowchart {direction}"]

    # nodes
    for c in comps:
        cid = c["id"]
        # BUG FIX: wrapped purpose lines must be joined with the literal
        # "\n" escape (as the service separator already is) — a raw
        # newline inside a quoted Mermaid node label is invalid syntax
        # and broke diagrams whenever the purpose wrapped.
        purpose = "\\n".join(textwrap.wrap(c["purpose"], 28))
        label = f'{c["service"]}\\n{purpose}'
        lines.append(f'{cid}["{label}"]')

    # edges
    for c in comps:
        src = c["id"]

        for target in c.get("connects_to", []):
            if isinstance(target, dict):
                dst = target.get("id")
            elif isinstance(target, str):
                dst = target
            else:
                continue

            if dst:
                lines.append(f"{src} --> {dst}")

    return "\n".join(lines)
|
||
|
||
def get_architecture_context(req: dict):
    """Gather retrieval context for one parsed RFP requirement.

    Combines vector-store chunks with knowledge-graph facts, reranks the
    chunks using terms mined from the graph output, and packages the top
    results as structured evidence for the architecture prompt.
    """
    query_terms = extract_graph_keywords_from_requirement(req)

    chunks = search_active_chunks(query_terms)
    graph_context = query_knowledge_graph(query_terms, top_k=20, min_score=1)

    # Terms surfaced by the graph drive a second-pass rerank of the chunks.
    graph_terms = extract_terms_from_graph_text(graph_context)
    top_chunks = rerank_documents_with_graph_terms(
        chunks,
        query_terms,
        graph_terms,
        top_k=8
    )

    return {
        "text_context": build_architecture_evidence(top_chunks),
        "graph_context": graph_context,
        "requirement_type": req["requirement_type"],
        "subject": req["subject"],
        "expected_value": req.get("expected_value", ""),
    }
|
||
|
||
def extract_first_balanced_json(text: str) -> str | None:
|
||
"""
|
||
Extrai APENAS o primeiro JSON balanceado.
|
||
Ignora QUALQUER coisa depois.
|
||
"""
|
||
|
||
start = text.find("{")
|
||
if start == -1:
|
||
return None
|
||
|
||
depth = 0
|
||
|
||
for i, c in enumerate(text[start:], start):
|
||
if c == "{":
|
||
depth += 1
|
||
elif c == "}":
|
||
depth -= 1
|
||
|
||
if depth == 0:
|
||
return text[start:i+1]
|
||
|
||
return None
|
||
|
||
def sanitize_json_string(text: str) -> str:
    """Double any backslash that does not start a valid JSON escape."""
    invalid_escape = r'\\(?!["\\/bfnrtu])'
    return re.sub(invalid_escape, r'\\\\', text)
|
||
|
||
|
||
def recover_json_object(text: str) -> str | None:
|
||
"""
|
||
Extrai o primeiro objeto JSON possível.
|
||
- ignora lixo antes/depois
|
||
- tolera truncamento
|
||
- fecha chaves automaticamente
|
||
"""
|
||
|
||
if not text:
|
||
return None
|
||
|
||
# remove markdown fences
|
||
text = text.replace("```json", "").replace("```", "")
|
||
|
||
start = text.find("{")
|
||
if start == -1:
|
||
return None
|
||
|
||
text = text[start:]
|
||
|
||
depth = 0
|
||
end_pos = None
|
||
|
||
for i, c in enumerate(text):
|
||
if c == "{":
|
||
depth += 1
|
||
elif c == "}":
|
||
depth -= 1
|
||
|
||
if depth == 0:
|
||
end_pos = i + 1
|
||
break
|
||
|
||
# ✅ caso normal → JSON completo
|
||
if end_pos:
|
||
return text[:end_pos]
|
||
|
||
# 🔥 caso TRUNCADO → fecha automaticamente
|
||
opens = text.count("{")
|
||
closes = text.count("}")
|
||
|
||
missing = opens - closes
|
||
|
||
if missing > 0:
|
||
text = text + ("}" * missing)
|
||
|
||
text = sanitize_json_string(text)
|
||
text = extract_first_balanced_json(text)
|
||
|
||
return text
|
||
|
||
def safe_parse_architecture_json(raw):
    """Robust JSON parser for LLM output.

    Dicts pass through unchanged and falsy input yields {}. Otherwise
    code fences are stripped, the outermost {...} block is isolated,
    control characters are scrubbed, and recover_json_object() is the
    fallback when the first json.loads fails.

    Raises:
        ValueError: when no JSON object can be located at all.
    """
    if isinstance(raw, dict):
        return raw

    if not raw:
        return {}

    raw = raw.replace("```json", "").replace("```", "").strip()

    # Keep only the outermost JSON block.
    match = re.search(r"\{.*\}", raw, re.DOTALL)
    if match is None:
        raise ValueError(f"Invalid architecture JSON:\n{raw}")

    json_text = match.group(0)

    # Remove invisible control characters.
    json_text = re.sub(r"[\x00-\x1F\x7F]", " ", json_text)

    # Normalize newlines.
    json_text = json_text.replace("\r", " ").replace("\n", " ")

    try:
        return json.loads(json_text)
    except Exception as parse_err:
        print(parse_err)
        print("⚠️ JSON RAW (debug):")
        print(json_text)
        try:
            return json.loads(recover_json_object(json_text))
        except Exception as recover_err:
            print(recover_err)
            raise recover_err
|
||
|
||
# =========================
|
||
# Oracle Graph Client
|
||
# =========================
|
||
# Ordinal ranking of LLM verdicts; higher is better.
ANSWER_RANK = {
    "YES": 3,
    "PARTIAL": 2,
    "NO": 1
}

# Ordinal ranking of self-reported confidence levels.
CONFIDENCE_RANK = {
    "HIGH": 3,
    "MEDIUM": 2,
    "LOW": 1
}

def score_answer(parsed: dict) -> int:
    """Score a parsed LLM answer for self-consistency selection.

    Base score combines the YES/PARTIAL/NO verdict (x10) with the
    confidence level; evidence quality then adds +3 per unique source,
    up to +5 for total quote volume (1 point per 500 chars across all
    sourced evidence), and heuristic per-source quality weights.

    Args:
        parsed: Answer dict with "answer", "confidence" and "evidence"
            (list of {"source", "quote"}) keys.

    Returns:
        Integer score; higher is better.
    """
    ans = parsed.get("answer", "NO")
    conf = parsed.get("confidence", "LOW")
    evidence = parsed.get("evidence", [])

    # Logical base score (verdict dominates, confidence breaks ties).
    base = ANSWER_RANK.get(ans, 0) * 10 + CONFIDENCE_RANK.get(conf, 0)

    unique_sources = set()
    quote_size = 0
    source_quality = 0

    for e in evidence:
        src = (e.get("source") or "").lower()
        quote = e.get("quote", "")

        # Evidence without a source contributes nothing.
        if not src:
            continue

        unique_sources.add(src)
        # BUG FIX: quote_size was reset to 0 on every iteration, so only
        # the LAST evidence item's quote length counted. Accumulate.
        if quote:
            quote_size += len(quote)

        # Quality weights by link type (best match wins via elif chain).
        if "/solutions/" in src:
            source_quality += 6

        elif "youtube.com" in src:
            source_quality += 5

        elif any(x in src for x in ["architecture", "overview", "concepts", "how-to"]):
            source_quality += 4

        elif "docs.oracle.com" in src:
            source_quality += 2

        elif any(x in src for x in ["home", "index", "portal"]):
            source_quality -= 5

    evidence_score = (
        len(unique_sources) * 3 +
        min(quote_size // 500, 5) +
        source_quality
    )

    return base + evidence_score
|
||
|
||
def ensure_oracle_text_index(
    conn,
    table_name: str,
    column_name: str,
    index_name: str
):
    """Create, repair, or sync an Oracle Text (CTXSYS.CONTEXT) index.

    Three cases:
      * index missing   -> create it and return (sync deferred),
      * index not VALID -> drop and recreate it, then return,
      * index VALID     -> run CTX_DDL.SYNC_INDEX to pick up new rows.

    Drop and sync failures are logged and swallowed so the ingestion
    pipeline keeps running.
    """
    cursor = conn.cursor()

    # Look up the index in the current schema's catalog.
    cursor.execute("""
        SELECT status
        FROM user_indexes
        WHERE index_name = :idx
    """, {"idx": index_name.upper()})

    row = cursor.fetchone()
    index_exists = row is not None
    index_status = row[0] if row else None

    if not index_exists:
        print(f"🛠️ Creating Oracle Text index {index_name}")

        # Identifiers cannot be bound, hence the f-string DDL; the
        # table/column/index names come from internal config, not users.
        cursor.execute(f"""
            CREATE INDEX {index_name}
            ON {table_name} ({column_name})
            INDEXTYPE IS CTXSYS.CONTEXT
        """)

        conn.commit()
        cursor.close()
        print(f"✅ Index {index_name} created (sync deferred)")
        return

    if index_status != "VALID":
        print(f"⚠️ Index {index_name} is {index_status}. Recreating...")

        try:
            cursor.execute(f"DROP INDEX {index_name}")
            conn.commit()
        except Exception as e:
            # Cannot drop a broken index: give up without killing the caller.
            print(f"❌ Failed to drop index {index_name}: {e}")
            cursor.close()
            return

        cursor.execute(f"""
            CREATE INDEX {index_name}
            ON {table_name} ({column_name})
            INDEXTYPE IS CTXSYS.CONTEXT
        """)
        conn.commit()
        cursor.close()
        print(f"♻️ Index {index_name} recreated (sync deferred)")
        return

    # Healthy index: incrementally sync pending DML ('2M' = sort memory).
    print(f"🔄 Syncing Oracle Text index: {index_name}")
    try:
        cursor.execute(f"""
            BEGIN
                CTX_DDL.SYNC_INDEX('{index_name}', '2M');
            END;
        """)
        conn.commit()
        print(f"✅ Index {index_name} synced")
    except Exception as e:
        print(f"⚠️ Sync failed for {index_name}: {e}")
        print("⚠️ Continuing without breaking pipeline")

    cursor.close()
|
||
|
||
def _col_exists(conn, table: str, col: str) -> bool:
|
||
cur = conn.cursor()
|
||
cur.execute("""
|
||
SELECT 1
|
||
FROM user_tab_cols
|
||
WHERE table_name = :t
|
||
AND column_name = :c
|
||
""", {"t": table.upper(), "c": col.upper()})
|
||
ok = cur.fetchone() is not None
|
||
cur.close()
|
||
return ok
|
||
|
||
|
||
def create_tables_if_not_exist(conn):
    """Create the knowledge-graph schema (nodes, edges, evidence) if absent.

    Each DDL runs inside EXECUTE IMMEDIATE and ignores ORA-00955 ("name
    is already used"), making the call idempotent. Afterwards a light
    migration adds CHUNK_HASH to a pre-existing edges table and an
    Oracle Text index is ensured on node names.
    """
    cursor = conn.cursor()

    try:
        # ---------------------------
        # KG_NODES
        # ---------------------------
        cursor.execute(f"""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE TABLE KG_NODES_{GRAPH_NAME} (
                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                        NODE_TYPE VARCHAR2(100),
                        NAME VARCHAR2(1000),
                        DESCRIPTION CLOB,
                        PROPERTIES CLOB,
                        CREATED_AT TIMESTAMP DEFAULT SYSTIMESTAMP
                    )
                ';
            EXCEPTION WHEN OTHERS THEN
                IF SQLCODE != -955 THEN RAISE; END IF;
            END;
        """)

        # ---------------------------
        # KG_EDGES (structure + bridge back to the source chunk)
        # ---------------------------
        cursor.execute(f"""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE TABLE KG_EDGES_{GRAPH_NAME} (
                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                        SOURCE_ID NUMBER,
                        TARGET_ID NUMBER,
                        EDGE_TYPE VARCHAR2(100),

                        -- ✅ governança / revogação
                        CHUNK_HASH VARCHAR2(64),

                        -- ✅ link principal (melhor url do chunk)
                        SOURCE_URL VARCHAR2(2000),

                        CONFIDENCE_WEIGHT NUMBER DEFAULT 1,
                        CREATED_AT TIMESTAMP DEFAULT SYSTIMESTAMP
                    )
                ';
            EXCEPTION WHEN OTHERS THEN
                IF SQLCODE != -955 THEN RAISE; END IF;
            END;
        """)

        # De-dupe edges per (source, target, type, originating chunk).
        cursor.execute(f"""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE UNIQUE INDEX KG_EDGE_UNQ_{GRAPH_NAME}
                    ON KG_EDGES_{GRAPH_NAME}
                    (SOURCE_ID, TARGET_ID, EDGE_TYPE, CHUNK_HASH)
                ';
            EXCEPTION WHEN OTHERS THEN
                IF SQLCODE != -955 THEN RAISE; END IF;
            END;
        """)

        # ---------------------------
        # KG_EVIDENCE (supporting quotes per edge)
        # ---------------------------
        cursor.execute(f"""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE TABLE KG_EVIDENCE_{GRAPH_NAME} (
                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                        EDGE_ID NUMBER,
                        CHUNK_HASH VARCHAR2(64),
                        SOURCE_URL VARCHAR2(2000),
                        QUOTE CLOB,
                        CREATED_AT TIMESTAMP DEFAULT SYSTIMESTAMP
                    )
                ';
            EXCEPTION WHEN OTHERS THEN
                IF SQLCODE != -955 THEN RAISE; END IF;
            END;
        """)

        conn.commit()

    finally:
        cursor.close()

    # ---------------------------
    # Light migration (when an older KG_EDGES table predates CHUNK_HASH)
    # ---------------------------
    edges_table = f"KG_EDGES_{GRAPH_NAME}"

    if not _col_exists(conn, edges_table, "CHUNK_HASH"):
        cur = conn.cursor()
        cur.execute(f"ALTER TABLE {edges_table} ADD (CHUNK_HASH VARCHAR2(64))")
        conn.commit()
        cur.close()

    # NOTE(review): placed at function level so the text index is ensured
    # on every call, not only when the migration ran — confirm against the
    # original indentation.
    ensure_oracle_text_index(conn, f"KG_NODES_{GRAPH_NAME}", "NAME", f"KG_NODES_{GRAPH_NAME}_NAME")

    print("✅ Graph schema (probatório) ready.")
|
||
|
||
#create_tables_if_not_exist(oracle_conn)
|
||
|
||
# IF GRAPH INDEX PROBLEM, Reindex
|
||
# ensure_oracle_text_index(
|
||
# oracle_conn,
|
||
# "ENTITIES_" + GRAPH_NAME,
|
||
# "NAME",
|
||
# "IDX_ENT_" + GRAPH_NAME + "_NAME"
|
||
# )
|
||
#
|
||
# ensure_oracle_text_index(
|
||
# oracle_conn,
|
||
# "RELATIONS_" + GRAPH_NAME,
|
||
# "RELATION_TYPE",
|
||
# "IDX_REL_" + GRAPH_NAME + "_RELTYPE"
|
||
# )
|
||
|
||
def create_knowledge_graph(chunks):
    """Extract capability triples from document chunks and persist them as a graph.

    For each chunk: ask the LLM for ``SERVICE -[RELATION]-> TARGET`` triples,
    upsert both endpoints into KG_NODES_{GRAPH_NAME}, insert a per-chunk
    deduplicated edge into KG_EDGES_{GRAPH_NAME} (tagged with chunk_hash and
    the resolved source URL), and store a supporting quote in
    KG_EVIDENCE_{GRAPH_NAME}. Commits every COMMIT_BATCH edge inserts and
    once at the end.

    Relies on module globals: oracle_conn, GRAPH_NAME, llm_for_rag, call_llm,
    resolve_chunk_source.
    """

    cursor = oracle_conn.cursor()
    inserted_counter = 0  # edges inserted so far; drives the batch commits
    COMMIT_BATCH = 500

    # =====================================================
    # 1️⃣ CREATE PROPERTY GRAPH (if it does not exist yet)
    # =====================================================
    # PL/SQL swallows ORA-00955 / ORA-55358 ("already exists") so the call
    # is idempotent across runs.
    try:
        cursor.execute(f"""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE PROPERTY GRAPH {GRAPH_NAME}
                    VERTEX TABLES (
                        KG_NODES_{GRAPH_NAME}
                        KEY (ID)
                        LABEL NODE_TYPE
                        PROPERTIES (NAME, DESCRIPTION, PROPERTIES)
                    )
                    EDGE TABLES (
                        KG_EDGES_{GRAPH_NAME}
                        KEY (ID)
                        SOURCE KEY (SOURCE_ID) REFERENCES KG_NODES_{GRAPH_NAME}(ID)
                        DESTINATION KEY (TARGET_ID) REFERENCES KG_NODES_{GRAPH_NAME}(ID)
                        LABEL EDGE_TYPE
                        PROPERTIES (CHUNK_HASH, SOURCE_URL, CONFIDENCE_WEIGHT)
                    )
                ';
            EXCEPTION
                WHEN OTHERS THEN
                    IF SQLCODE NOT IN (-55358, -955) THEN
                        RAISE;
                    END IF;
            END;
        """)
        print(f"🧠 Graph '{GRAPH_NAME}' ready.")
    except Exception as e:
        print(f"[GRAPH ERROR] {e}")

    # =====================================================
    # 2️⃣ Helper: MERGE NODE (optimized)
    # =====================================================
    def build_default_node_properties():
        """Default JSON payload stored in KG_NODES.PROPERTIES for new nodes."""
        return {
            "metadata": {
                "created_by": "RFP_AI_V2",
                "version": "2.0",
                # NOTE(review): datetime.utcnow() is naive and deprecated in
                # Python 3.12+; consider datetime.now(timezone.utc).
                "created_at": datetime.utcnow().isoformat()
            },
            "analysis": {
                "confidence_score": None,
                "source": "DOCUMENT_RAG",
                "extraction_method": "LLM_TRIPLE_EXTRACTION"
            },
            "governance": {
                "validated": False,
                "review_required": False
            }
        }

    def ensure_node_properties_structure(properties):
        """Deep-fill *properties* with any keys missing from the default
        structure; returns the full defaults when *properties* is falsy."""
        base = build_default_node_properties()

        if not properties:
            return base

        def merge(d1, d2):
            # Copy defaults from d1 into d2 only where d2 lacks the key;
            # recurses into nested dicts. Mutates and returns d2.
            for k, v in d1.items():
                if k not in d2:
                    d2[k] = v
                elif isinstance(v, dict):
                    d2[k] = merge(v, d2.get(k, {}))
            return d2

        return merge(base, properties)

    def merge_node(node_type, name, description=None, properties=None):
        """Return the ID of the (NODE_TYPE, NAME) node, inserting it when absent.

        The name is trimmed and capped at 500 chars. If a concurrent writer
        inserts the same node first (IntegrityError under a unique
        constraint), the existing row is re-read instead of failing.
        """

        name = (name or "").strip()[:500]

        # 1️⃣ Try existing
        cursor.execute(f"""
            SELECT ID
            FROM KG_NODES_{GRAPH_NAME}
            WHERE NAME = :name_val
              AND NODE_TYPE = :node_type_val
        """, {
            "name_val": name,
            "node_type_val": node_type
        })

        row = cursor.fetchone()
        if row:
            return row[0]

        # 2️⃣ Insert safely
        node_id_var = cursor.var(oracledb.NUMBER)

        try:
            cursor.execute(f"""
                INSERT INTO KG_NODES_{GRAPH_NAME}
                (NODE_TYPE, NAME, DESCRIPTION, PROPERTIES)
                VALUES (:node_type_val, :name_val, :desc_val, :props_val)
                RETURNING ID INTO :node_id
            """, {
                "node_type_val": node_type,
                "name_val": name,
                "desc_val": description,
                "props_val": json.dumps(
                    ensure_node_properties_structure(properties)
                ),
                "node_id": node_id_var
            })

            return int(node_id_var.getvalue()[0])

        except oracledb.IntegrityError:
            # if unique constraint exists — another writer won the race,
            # so fetch the row it created.
            cursor.execute(f"""
                SELECT ID
                FROM KG_NODES_{GRAPH_NAME}
                WHERE NAME = :name_val
                  AND NODE_TYPE = :node_type_val
            """, {
                "name_val": name,
                "node_type_val": node_type
            })
            row = cursor.fetchone()
            if row:
                return row[0]
            raise

    def extract_sentence_with_term(text, term):
        """Return the first sentence of *text* containing *term*
        (case-insensitive); fall back to the first 1000 chars."""
        sentences = re.split(r'(?<=[.!?]) +', text)
        for s in sentences:
            if term.lower() in s.lower():
                return s.strip()
        return text[:1000]

    # =====================================================
    # 3️⃣ PROCESS CHUNKS
    # =====================================================
    for doc in chunks:

        text = doc.page_content or ""
        chunk_hash_value = doc.metadata.get("chunk_hash")
        source_url = resolve_chunk_source(doc)

        # Skip empty chunks and chunks without a governance hash.
        if not text.strip() or not chunk_hash_value:
            continue

        prompt = f"""
Extract explicit technical capabilities from the text below.

Text:
{text}

Return triples ONLY in format:

SERVICE -[SUPPORTS_CAPABILITY]-> CAPABILITY
SERVICE -[DOES_NOT_SUPPORT]-> CAPABILITY
SERVICE -[HAS_LIMITATION]-> LIMITATION
SERVICE -[HAS_SLA]-> SLA_VALUE

Rules:
- Use exact service names if present
- Use UPPERCASE relation names
- No inference
- If none found return NONE
"""

        try:
            response = call_llm(llm_for_rag.invoke, prompt)
            result = response.content.strip()
        except Exception as e:
            print(f"[LLM ERROR] {e}")
            continue

        if result.upper() == "NONE":
            continue

        triples = result.splitlines()

        for triple in triples:

            # Parse "LEFT -[RELATION]-> RIGHT"; malformed lines are skipped.
            parts = triple.split("-[")
            if len(parts) != 2:
                continue

            right = parts[1].split("]->")
            if len(right) != 2:
                continue

            entity1 = parts[0].strip()
            raw_relation = right[0].strip().upper()
            entity2 = right[1].strip()

            MAX_NODE_NAME = 500  # matches the cap applied inside merge_node

            entity1 = entity1[:MAX_NODE_NAME]
            entity2 = entity2[:MAX_NODE_NAME]

            # Normalize the relation to a safe identifier (non-word chars -> _).
            relation = re.sub(r'\W+', '_', raw_relation)

            source_id = merge_node(
                node_type="SERVICE",
                name=entity1,
                description=None,
                properties={
                    "chunk_hash": chunk_hash_value,
                    "source_url": source_url
                }
            )

            # Target node type follows the relation semantics.
            if relation == "DOES_NOT_SUPPORT":
                target_type = "UNSUPPORTED_CAPABILITY"
            elif relation == "HAS_LIMITATION":
                target_type = "LIMITATION"
            elif relation == "HAS_SLA":
                target_type = "SLA"
            else:
                target_type = "CAPABILITY"

            description_text = extract_sentence_with_term(text, entity2)

            target_id = merge_node(
                node_type=target_type,
                name=entity2,
                description=description_text
            )

            # 🔥 Avoid duplicating the same edge for the same chunk.
            cursor.execute(f"""
                SELECT ID
                FROM KG_EDGES_{GRAPH_NAME}
                WHERE SOURCE_ID = :s
                  AND TARGET_ID = :t
                  AND EDGE_TYPE = :r
                  AND CHUNK_HASH = :h
            """, {
                "s": source_id,
                "t": target_id,
                "r": relation,
                "h": chunk_hash_value
            })

            if cursor.fetchone():
                continue

            # =====================================================
            # INSERT EDGE + RETURNING ID
            # =====================================================
            edge_id_var = cursor.var(oracledb.NUMBER)

            cursor.execute(f"""
                INSERT INTO KG_EDGES_{GRAPH_NAME}
                (SOURCE_ID, TARGET_ID, EDGE_TYPE, CHUNK_HASH, SOURCE_URL, CONFIDENCE_WEIGHT)
                VALUES (:src, :tgt, :rel, :h, :url, :w)
                RETURNING ID INTO :edge_id
            """, {
                "src": source_id,
                "tgt": target_id,
                "rel": relation,
                "h": chunk_hash_value,
                "url": source_url,
                "w": 1,
                "edge_id": edge_id_var
            })

            edge_id = int(edge_id_var.getvalue()[0])

            # =====================================================
            # INSERT EVIDENCE (supporting quote, capped at 1500 chars)
            # =====================================================
            quote = text[:1500]

            cursor.execute(f"""
                INSERT INTO KG_EVIDENCE_{GRAPH_NAME}
                (EDGE_ID, CHUNK_HASH, SOURCE_URL, QUOTE)
                VALUES (:eid, :h, :url, :q)
            """, {
                "eid": edge_id,
                "h": chunk_hash_value,
                "url": source_url,
                "q": quote
            })

            inserted_counter += 1

            print(f"✅ {entity1} -[{relation}]-> {entity2}")

            # =====================================================
            # COMMIT EVERY 500 INSERTS
            # =====================================================
            if inserted_counter % COMMIT_BATCH == 0:
                oracle_conn.commit()
                print(f"💾 Batch commit ({inserted_counter} records)")

    # Final commit
    oracle_conn.commit()
    cursor.close()

    print(f"💾 Knowledge graph updated. Total inserted: {inserted_counter}")
|
||
|
||
def parse_rfp_requirement(question: str) -> dict:
    """Normalize an RFP question into a structured requirement dict via the LLM.

    The prompt forces the model to classify the requirement against a fixed
    OCI service catalog and return JSON inside <json> tags. The JSON block is
    extracted and parsed; on any failure a degraded fallback dict is returned
    (requirement_type UNKNOWN, the raw question as subject, and the first 5
    words of the question as keywords).

    Relies on module globals: llm_for_rag, call_llm.
    """
    prompt = f"""
You are an RFP requirement NORMALIZER for Oracle Cloud Infrastructure (OCI).

Your job is NOT to summarize the question.
Your job is to STRUCTURE the requirement so it can be searched in:
- Technical documentation
- Knowledge Graph
- Vector databases

────────────────────────────────
STEP 1 — Understand the requirement
────────────────────────────────
From the question, identify:
1. The PRIMARY OCI SERVICE CATEGORY involved
2. The MAIN TECHNICAL SUBJECT (short and precise)
3. The EXPECTED TECHNICAL CAPABILITY or CONDITION (if any)

IMPORTANT:
- Ignore marketing language
- Ignore phrases like "possui", "permite", "oferece"
- Focus ONLY on concrete technical meaning

────────────────────────────────
STEP 2 — Mandatory service classification
────────────────────────────────
You MUST choose ONE primary technology from the list below
and INCLUDE IT EXPLICITLY in the keywords list.

Choose the MOST SPECIFIC applicable item.

Serviços da Oracle Cloud Infrastructure (OCI):

Compute (IaaS)
• Compute Instances (VM)
• Bare Metal Instances
• Dedicated VM Hosts
• GPU Instances
• Confidential Computing
• Capacity Reservations
• Autoscaling (Instance Pools)
• Live Migration
• Oracle Cloud VMware Solution (OCVS)
• HPC (High Performance Computing)
• Arm-based Compute (Ampere)

Storage

Object Storage
• Object Storage
• Object Storage – Archive
• Pre-Authenticated Requests
• Replication

Block & File
• Block Volume
• Boot Volume
• Volume Groups
• File Storage
• File Storage Snapshots
• Data Transfer Service

Networking
• Virtual Cloud Network (VCN)
• Subnets
• Internet Gateway
• NAT Gateway
• Service Gateway
• Dynamic Routing Gateway (DRG)
• FastConnect
• Load Balancer (L7 / L4)
• Network Load Balancer
• DNS
• Traffic Management Steering Policies
• IP Address Management (IPAM)
• Network Firewall
• Web Application Firewall (WAF)
• Bastion
• Capture Traffic (VTAP)
• Private Endpoints

Security, Identity & Compliance
• Identity and Access Management (IAM)
• Compartments
• Policies
• OCI Vault
• OCI Key Management (KMS)
• OCI Certificates
• OCI Secrets
• OCI Bastion
• Cloud Guard
• Security Zones
• Vulnerability Scanning Service
• Data Safe
• Logging
• Audit
• OS Management / OS Management Hub
• Shielded Instances
• Zero Trust Packet Routing

Databases

Autonomous
• Autonomous Database (ATP)
• Autonomous Data Warehouse (ADW)
• Autonomous JSON Database

Databases Gerenciados
• Oracle Database Service
• Oracle Exadata Database Service
• Exadata Cloud@Customer
• Base Database Service
• MySQL Database Service
• MySQL HeatWave
• NoSQL Database Cloud Service
• TimesTen
• PostgreSQL (OCI managed)
• MongoDB API (OCI NoSQL compatibility)

Analytics & BI
• Oracle Analytics Cloud (OAC)
• OCI Data Catalog
• OCI Data Integration
• OCI Streaming Analytics
• OCI GoldenGate
• OCI Big Data Service (Hadoop/Spark)
• OCI Data Science
• OCI AI Anomaly Detection
• OCI AI Forecasting

AI & Machine Learning

Generative AI
• OCI Generative AI
• OCI Generative AI Agents
• OCI Generative AI RAG
• OCI Generative AI Embeddings
• OCI AI Gateway (OpenAI-compatible)

AI Services
• OCI Vision (OCR, image analysis)
• OCI Speech (STT / TTS)
• OCI Language (NLP)
• OCI Document Understanding
• OCI Anomaly Detection
• OCI Forecasting
• OCI Data Labeling

Containers & Cloud Native
• OCI Container Engine for Kubernetes (OKE)
• Container Registry (OCIR)
• Service Mesh
• API Gateway
• OCI Functions (FaaS)
• OCI Streaming (Kafka-compatible)
• OCI Queue
• OCI Events
• OCI Resource Manager (Terraform)

Integration & Messaging
• OCI Integration Cloud (OIC)
• OCI Service Connector Hub
• OCI Streaming
• OCI GoldenGate
• OCI API Gateway
• OCI Events Service
• OCI Queue
• Real Applications Clusters (RAC)

Developer Services
• OCI DevOps (CI/CD)
• OCI Code Repository
• OCI Build Pipelines
• OCI Artifact Registry
• OCI Logging Analytics
• OCI Monitoring
• OCI Notifications
• OCI Bastion
• OCI CLI
• OCI SDKs

Observability & Management
• OCI Monitoring
• OCI Alarms
• OCI Logging
• OCI Logging Analytics
• OCI Application Performance Monitoring (APM)
• OCI Operations Insights
• OCI Management Agent
• OCI Resource Discovery

Enterprise & Hybrid
• Oracle Cloud@Customer
• Exadata Cloud@Customer
• Compute Cloud@Customer
• Dedicated Region Cloud@Customer
• OCI Roving Edge Infrastructure
• OCI Alloy

Governance & FinOps
• OCI Budgets
• Cost Analysis
• Usage Reports
• Quotas
• Tagging
• Compartments
• Resource Search

Regions & Edge
• OCI Regions (Commercial, Government, EU Sovereign)
• OCI Edge Services
• OCI Roving Edge
• OCI Dedicated Region

────────────────────────────────
STEP 3 — Keywords rules (CRITICAL)
────────────────────────────────
The "keywords" field MUST:
- ALWAYS include at least ONE OCI service keyword (e.g. "compute", "object storage", "oke")
- Include technical capability terms (e.g. resize, autoscaling, encryption)
- NEVER include generic verbs (permitir, possuir, oferecer)
- NEVER include full sentences

────────────────────────────────
STEP 4 — Output rules
────────────────────────────────
Return ONLY valid JSON between <json> tags.
Do NOT explain your reasoning.

Question:
{question}

<json>
{{
"requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL",
"subject": "<short technical subject, e.g. 'Compute Instances'>",
"expected_value": "<technical capability or condition, or empty string>",
"decision_type": "YES_NO | YES_NO_PARTIAL",
"keywords": ["mandatory_oci_service", "technical_capability", "additional_term"]
}}
</json>
"""

    # resp = llm_for_rag.invoke(prompt)
    resp = call_llm(llm_for_rag.invoke, prompt)
    raw = resp.content.strip()

    try:
        # Strip markdown code fences (```json ... ``` or ``` ... ```).
        raw = re.sub(r"```json|```", "", raw).strip()

        # Pull the first {...} object between <json> tags.
        match = re.search(r"<json>\s*(\{.*?\})\s*</json>", raw, re.DOTALL)
        if not match:
            raise ValueError("No JSON block found")
        json_text = match.group(1)

        return json.loads(json_text)

    except Exception as e:
        # Fallback keeps the pipeline alive with a best-effort requirement.
        print("⚠️ RFP PARSER FAILED")
        print("RAW RESPONSE:")
        print(raw)

        return {
            "requirement_type": "UNKNOWN",
            "subject": question,
            "expected_value": "",
            "decision_type": "YES_NO_PARTIAL",
            "keywords": re.findall(r"\b\w+\b", question.lower())[:5]
        }
|
||
|
||
def extract_graph_keywords_from_requirement(req: dict) -> str:
    """Flatten a parsed requirement into a sorted, comma-separated keyword string.

    Combines the explicit ``keywords`` list with the lowercased ``subject``
    and ``expected_value`` fields (when present), de-duplicated via a set.
    """
    terms = set(req.get("keywords", []))

    subject = req.get("subject")
    if subject:
        terms.add(subject.lower())

    expected = req.get("expected_value")
    if expected:
        terms.add(str(expected).lower())

    return ", ".join(sorted(terms))
|
||
|
||
# Words dropped when building Oracle Text queries (see build_oracle_text_query):
# English boolean words plus common Portuguese articles/prepositions.
STOPWORDS = {
    "and", "or", "not",
    "de", "da", "do", "das", "dos", "a", "o", "as", "os", "e", "em", "no", "na", "nos", "nas", "para", "por", "com"
}
|
||
|
||
def build_oracle_text_query(text: str) -> Optional[str]:
    """Turn free text into a sanitized Oracle Text CONTAINS query string.

    The input is lowercased and de-accented, split into phrase-ish runs,
    stopwords and words shorter than 4 characters are dropped, multi-word
    phrases are double-quoted (hyphens normalized to spaces, which is safer
    for Oracle Text), and the de-duplicated tokens are OR-joined.
    Returns None when nothing usable remains.
    """
    if not text:
        return None

    normalized = strip_accents(text.lower())

    # Phrase-ish runs of letters/digits/hyphens/spaces, at least 3 chars long.
    candidate_phrases = re.findall(r"[a-z0-9][a-z0-9\- ]{2,}", normalized)

    query_tokens: list[str] = []

    for phrase in candidate_phrases:
        phrase = phrase.strip()
        if not phrase:
            continue

        # Word-level split ("store-and-forward" -> store / and / forward),
        # then drop stopwords and short words.
        parts = [
            word for word in re.findall(r"[a-z0-9]+", phrase)
            if word not in STOPWORDS and len(word) >= 4
        ]
        if not parts:
            continue

        # Single word goes in bare; multi-word phrases are quoted so Oracle
        # Text treats them as a phrase.
        if len(parts) == 1:
            query_tokens.append(parts[0])
        else:
            query_tokens.append(f"\"{' '.join(parts)}\"")

    query_tokens = sorted(set(query_tokens))
    return " OR ".join(query_tokens) if query_tokens else None
|
||
|
||
def detect_negative_conflict(graph_context, req):
    """Check graph rows for an explicit DOES_NOT_SUPPORT edge contradicting *req*.

    graph_context rows are (service, edge_type, target, ...) tuples as
    returned by query_knowledge_graph. Matching is a case-insensitive
    substring test of the requirement's expected_value / subject against the
    edge target.

    Returns {"conflict": True, "service": ..., "capability": ...} on the
    first match, otherwise {"conflict": False}.
    """
    expected = (req.get("expected_value") or "").lower().strip()
    subject = (req.get("subject") or "").lower().strip()

    # FIX: only match on non-empty terms. "" is a substring of every string,
    # so an empty expected_value/subject used to flag EVERY negative edge
    # as a conflict.
    terms = [t for t in (expected, subject) if t]
    if not terms:
        return {"conflict": False}

    for row in graph_context:
        service, edge_type, target, *_ = row

        if edge_type != "DOES_NOT_SUPPORT":
            continue

        target_lower = target.lower()
        if any(term in target_lower for term in terms):
            return {
                "conflict": True,
                "service": service,
                "capability": target
            }

    return {"conflict": False}
|
||
|
||
def query_knowledge_graph(raw_keywords: str, top_k: int = 20, min_score: int = 0):
    """Query the knowledge graph for SERVICE edges matching the given keywords.

    Uses an Oracle Text CONTAINS search on both endpoint node names; edges
    whose chunk was revoked in RAG_CHUNKS_GOV or whose confidence weight is
    zero are excluded. Rows come back as (service_name, relation_type,
    target_name, source_url, confidence_weight, relevance_score), best first.

    FIX: the keyword expression, min_score and top_k are now bind variables
    instead of being interpolated into the SQL text (injection-safe, shared
    cursor plan); the cursor is closed even when execution fails. Table names
    remain f-string interpolated because GRAPH_NAME is a trusted module
    constant.
    """
    safe_query = build_oracle_text_query(raw_keywords)
    if not safe_query:
        return []

    sql = f"""
        SELECT * FROM (
            SELECT
                s.NAME AS service_name,
                e.EDGE_TYPE AS relation_type,
                t.NAME AS target_name,
                e.SOURCE_URL,
                e.CONFIDENCE_WEIGHT,
                CASE
                    WHEN CONTAINS(s.NAME, :q) > 0 AND CONTAINS(t.NAME, :q) > 0 THEN 3
                    WHEN CONTAINS(s.NAME, :q) > 0 THEN 2
                    WHEN CONTAINS(t.NAME, :q) > 0 THEN 1
                    ELSE 0
                END AS relevance_score
            FROM KG_EDGES_{GRAPH_NAME} e
            JOIN KG_NODES_{GRAPH_NAME} s ON s.ID = e.SOURCE_ID
            JOIN KG_NODES_{GRAPH_NAME} t ON t.ID = e.TARGET_ID
            WHERE s.NODE_TYPE = 'SERVICE'
              AND (
                    CONTAINS(t.NAME, :q) > 0
                 OR CONTAINS(s.NAME, :q) > 0
              )
              AND e.CHUNK_HASH NOT IN (
                    SELECT CHUNK_HASH
                    FROM RAG_CHUNKS_GOV
                    WHERE STATUS = 'REVOKED'
              )
        )
        WHERE relevance_score >= :min_score
          AND CONFIDENCE_WEIGHT > 0
        ORDER BY relevance_score DESC
        FETCH FIRST :top_k ROWS ONLY
    """

    print(sql)

    cursor = oracle_conn.cursor()
    try:
        cursor.execute(sql, {
            "q": safe_query,
            "min_score": min_score,
            "top_k": top_k,
        })
        rows = cursor.fetchall()
    finally:
        cursor.close()

    for row in rows:
        print(row)

    return rows
|
||
|
||
# RE-RANK
|
||
|
||
def extract_terms_from_graph_text(graph_context):
    """Collect lowercase search terms from knowledge-graph output.

    Accepts either the row-tuple list returned by query_knowledge_graph
    (every string column becomes a term) or a textual triple dump in the
    form "ENTITY -[RELATION]-> ENTITY" (both entities become terms).
    Any other / empty input yields an empty set.
    """
    if not graph_context:
        return set()

    if isinstance(graph_context, list):
        terms = set()
        for row in graph_context:
            for col in row:
                if isinstance(col, str):
                    terms.add(col.lower())
        return terms

    if isinstance(graph_context, str):
        terms = set()
        # FIX: the previous pattern was corrupted (stray "$begin:math:display$"
        # LaTeX markers from a copy/paste) and could never match; this matches
        # "A -[REL]-> B" triples as produced by the extraction prompt.
        pairs = re.findall(r"([\w\s]+)-\[[\w_]+\]->([\w\s]+)", graph_context)
        for e1, e2 in pairs:
            terms.add(e1.strip().lower())
            terms.add(e2.strip().lower())
        return terms

    return set()
|
||
|
||
def rerank_documents_with_graph_terms(docs, query, graph_terms, top_k=12, per_source=2):
    """Re-rank retrieved docs against query words plus knowledge-graph terms.

    Each doc is scored by weighted term-hit count, term density, and the
    source-URL quality from score_arch_url. The top docs are then selected
    best-first, allowing at most *per_source* docs per source, up to *top_k*.
    """
    vocabulary = set(re.findall(r'\b\w+\b', query.lower())) | set(graph_terms)

    def relevance(doc):
        body = doc.page_content.lower()
        src_lower = (doc.metadata.get("source") or "").lower()
        hits = sum(body.count(term) for term in vocabulary)
        density = hits / max(len(body.split()), 1)
        return (hits * 2) + (density * 20) + score_arch_url(src_lower)

    ranked = sorted(docs, key=relevance, reverse=True)

    picked = []
    picks_per_source = {}

    for doc in ranked:
        src = doc.metadata.get("source")

        if picks_per_source.get(src, 0) >= per_source:
            continue

        picked.append(doc)
        picks_per_source[src] = picks_per_source.get(src, 0) + 1

        if len(picked) >= top_k:
            break

    return picked
|
||
|
||
def load_processed_hashes_from_graph():
    """Return the set of chunk hashes that already produced at least one
    edge in the knowledge graph (used to resume graph builds)."""
    cur = oracle_conn.cursor()
    cur.execute(f"""
        SELECT DISTINCT CHUNK_HASH
        FROM KG_EDGES_{GRAPH_NAME}
    """)
    processed = {record[0] for record in cur.fetchall()}
    cur.close()
    return processed
|
||
|
||
def rebuild_graph_from_faiss(
    faiss_path: str,
    reverse_resume: bool = True,
    consecutive_threshold: int = 20
):
    """Rebuild the knowledge graph from every chunk stored in a FAISS index.

    With reverse_resume=True, chunks are scanned from the end of the
    docstore backwards; chunks whose hash already has graph edges are
    skipped, and after *consecutive_threshold* consecutive already-processed
    chunks the scan stops, assuming everything earlier was processed too.
    Surviving chunks are then fed to create_knowledge_graph one at a time.
    """
    from langchain_community.vectorstores import FAISS

    print("🔄 Loading FAISS index...")

    vectorstore = FAISS.load_local(
        faiss_path,
        embeddings,
        allow_dangerous_deserialization=True
    )

    # NOTE(review): reaches into FAISS internals (docstore._dict) — private
    # attribute, may break across langchain versions.
    docs = list(vectorstore.docstore._dict.values())
    print(f"📄 {len(docs)} chunks loaded")

    if reverse_resume:
        print("🔁 Reverse resume mode active")

        processed_hashes = load_processed_hashes_from_graph()

        docs_to_process = []
        consecutive_processed = 0

        for d in reversed(docs):
            h = d.metadata.get("chunk_hash")

            if h in processed_hashes:
                consecutive_processed += 1
                if consecutive_processed >= consecutive_threshold:
                    print("🛑 Boundary detected. Stopping reverse scan.")
                    break
                continue
            else:
                consecutive_processed = 0
                docs_to_process.append(d)

        # Restore original (forward) order for processing.
        docs = list(reversed(docs_to_process))
        print(f"🚀 Will process {len(docs)} chunks")

    for chunk in tqdm(docs, desc="🧠 Building Graph"):
        create_knowledge_graph([chunk])

    print("✅ Graph rebuild completed.")
|
||
|
||
# SEMANTIC CHUNKING
|
||
|
||
def split_llm_output_into_chapters(llm_text):
    """Split LLM/markdown text into chapters.

    A new chapter starts at every heading line — markdown '#' headings or a
    fully bolded '**...**' line — as defined by the module-level
    chapter_separator_regex. Each chapter is returned as a stripped string.
    """
    chapters = []
    buffer = []

    for line in llm_text.splitlines():
        if re.match(chapter_separator_regex, line):
            # Flush the chapter accumulated so far and start a new one
            # headed by this line.
            if buffer:
                chapters.append("\n".join(buffer).strip())
            buffer = [line]
        else:
            buffer.append(line)

    if buffer:
        chapters.append("\n".join(buffer).strip())

    return chapters
|
||
|
||
|
||
def semantic_chunking(text):
    """Ask the LLM to restructure OCR-extracted text into headed sections,
    marking columns, tables, reference URLs and explicit metrics.

    FIX: the previous implementation retried forever on a bare ``except:``
    (which also swallowed KeyboardInterrupt). Failures are now caught as
    ``Exception`` only and retried a bounded number of times; the last error
    is re-raised when every attempt fails.
    """
    prompt = f"""
    You received the following text extracted via OCR:

    {text}

    Your task:
    1. Identify headings (short uppercase or bold lines, no period at the end) putting the Product Name (Application Name) and the Subject
    2. Separate paragraphs by heading
    3. Indicate columns with [COLUMN 1], [COLUMN 2] if present
    4. Indicate tables with [TABLE] in markdown format
    5. ALWAYS PUT THE URL if there is a Reference
    6. Indicate explicity metrics (if it exists)
    Examples:
    - Oracle Financial Services RTO is 1 hour
    - The Oracle Banking Supply Chain Finance Cloud Service A maximum number of 10K Hosted Transactions
    - The Oracle Banking Payments Cloud Service, Additional Non-Production Environment: You may purchase up to a maximum of ten (10) additional Non-Production Environments
    """

    MAX_ATTEMPTS = 5
    last_error = None

    for _ in range(MAX_ATTEMPTS):
        try:
            # response = llm_for_rag.invoke(prompt)
            return call_llm(llm_for_rag.invoke, prompt)
        except Exception as e:
            last_error = e
            print("[ERROR] Gen AI call error")

    raise last_error
|
||
|
||
def read_pdfs(pdf_path):
    """Load a PDF and return its full text, pages joined by newlines.

    Paths containing "-ocr" are read with PyMuPDFLoader; everything else
    goes through UnstructuredPDFLoader.
    """
    loader_cls = PyMuPDFLoader if "-ocr" in pdf_path else UnstructuredPDFLoader
    pages = loader_cls(str(pdf_path)).load()
    return "\n".join(page.page_content for page in pages)
|
||
|
||
|
||
def smart_split_text(text, max_chunk_size=10_000):
    """Split *text* into chunks of at most *max_chunk_size* characters,
    preferring to cut just after sentence-ending punctuation ('.', '!', '?')
    or a blank line; when no such boundary exists in the window, cut hard at
    the size limit. Empty chunks (after stripping) are dropped.
    """
    pieces = []
    cursor = 0
    total = len(text)

    while cursor < total:
        window_end = min(cursor + max_chunk_size, total)

        # Best natural boundary inside the window (-1 when none exists).
        boundary = max(
            text.rfind(mark, cursor, window_end)
            for mark in ('.', '!', '?', '\n\n')
        )

        # boundary <= cursor covers both "not found" (-1) and boundaries
        # at/behind the window start — fall back to a hard cut.
        if boundary <= cursor:
            cut = window_end
        else:
            cut = boundary + 1

        piece = text[cursor:cut].strip()
        if piece:
            pieces.append(piece)

        cursor = cut

    return pieces
|
||
|
||
|
||
def load_previously_indexed_docs():
    """Return the set of already-indexed document identifiers from the pickle
    checkpoint, or an empty set when no checkpoint file exists yet."""
    if not os.path.exists(PROCESSED_DOCS_FILE):
        return set()
    with open(PROCESSED_DOCS_FILE, "rb") as fh:
        return pickle.load(fh)
|
||
|
||
|
||
def save_indexed_docs(docs):
    """Persist the indexed-document set to the pickle checkpoint file."""
    with open(PROCESSED_DOCS_FILE, "wb") as fh:
        pickle.dump(docs, fh)
|
||
|
||
def retrieve_active_docs(query_terms: str, k: int = 50):
    """Retrieve chunks from the vector store, keeping only those whose
    chunk_hash is still ACTIVE in RAG_CHUNKS_GOV.

    Falls back to the raw retriever results when no chunk carries a hash.

    FIX: the previous ``TABLE(:hashes)`` bind cannot accept a plain Python
    list in python-oracledb without a declared SQL collection type, so the
    query always failed. Hashes are now bound individually (same pattern as
    get_chunk_metadata), batched to respect Oracle's 1000-item IN-list
    limit, and the cursor is closed via try/finally.
    """
    docs = retriever.invoke(query_terms)

    hashes = [d.metadata.get("chunk_hash") for d in docs if d.metadata.get("chunk_hash")]
    if not hashes:
        return docs[:k]

    active = set()
    cursor = oracle_conn.cursor()
    try:
        for start in range(0, len(hashes), 1000):
            batch = hashes[start:start + 1000]
            placeholders = ",".join(f":h{i}" for i in range(len(batch)))
            cursor.execute(
                f"""
                SELECT chunk_hash
                FROM RAG_CHUNKS_GOV
                WHERE chunk_hash IN ({placeholders})
                  AND status = 'ACTIVE'
                """,
                {f"h{i}": h for i, h in enumerate(batch)},
            )
            active.update(r[0] for r in cursor.fetchall())
    finally:
        cursor.close()

    return [d for d in docs if d.metadata.get("chunk_hash") in active][:k]
|
||
|
||
# Matches http(s) URLs, stopping at whitespace, quotes and common closing
# brackets so trailing prose punctuation is not captured.
URL_REGEX = re.compile(r"https?://[^\s\)\]\}<>\"']+", re.IGNORECASE)
|
||
|
||
def best_url(urls):
    """Return the URL from *urls* most likely to be useful documentation.

    Heuristic scoring: the official docs host and service/architecture pages
    gain points, generic landing pages lose points. Ties keep the earliest
    URL in the input.
    """
    service_terms = ["compute", "database", "oke", "storage", "network", "security"]
    topic_terms = ["overview", "architecture", "concepts", "how-to", "use-case"]
    generic_terms = ["home", "index", "portal", "release-notes", "faq"]

    def rate(url):
        lowered = url.lower()
        points = 0

        if "docs.oracle.com" in lowered:
            points += 3

        if any(term in lowered for term in service_terms):
            points += 5

        if any(term in lowered for term in topic_terms):
            points += 4

        if any(term in lowered for term in generic_terms):
            points -= 10

        return points

    return max(urls, key=rate)
|
||
|
||
def resolve_chunk_source(doc):
    """Pick the best source URL for a chunk.

    Priority: (1) a non-generic URL embedded in the chunk body, scored by
    best_url; (2) the "reference" metadata field when it is an http(s) URL;
    (3) the "source" metadata field when it is an http(s) URL; otherwise a
    generic documentation label.
    """
    content = doc.page_content or ""
    metadata = doc.metadata or {}

    # 1) Real URLs inside the chunk body, minus generic landing pages.
    embedded = [
        url for url in URL_REGEX.findall(content)
        if not any(marker in url.lower() for marker in ["home", "index", "portal"])
    ]
    if embedded:
        return best_url(embedded)

    # 2) "reference" usually beats "source"; 3) fall back to "source".
    for key in ("reference", "source"):
        candidate = metadata.get(key)
        if candidate and candidate.startswith("http"):
            return candidate

    return "Oracle Cloud Infrastructure documentation"
|
||
|
||
def extract_first_url_from_chunk(*texts: str) -> str | None:
    """Return the first URL found across the given texts, or None.

    Texts (chunk body, metadata fields, etc.) are scanned in order; falsy
    entries are skipped.
    """
    for candidate in texts:
        if not candidate:
            continue
        hit = URL_REGEX.search(candidate)
        if hit:
            return hit.group(0)
    return None
|
||
|
||
def aggregate_chunks_by_source(docs):
    """Group chunk texts by their best-known source.

    The bucket key is the first URL found in the chunk body or its
    reference/source metadata, falling back to the raw metadata fields and
    finally "UNKNOWN". Returns {key: [page_content, ...]}.
    """
    grouped = {}

    for doc in docs:
        meta = doc.metadata or {}

        bucket_key = (
            extract_first_url_from_chunk(
                doc.page_content,
                meta.get("reference", ""),
                meta.get("source", ""),
            )
            or meta.get("reference")
            or meta.get("source")
            or "UNKNOWN"
        )

        grouped.setdefault(bucket_key, []).append(doc.page_content)

    return grouped
|
||
|
||
def search_active_chunks(statement: str, k: int = 3000):
    """Semantic search over the vector store, filtered to ACTIVE chunks.

    Only docs whose chunk_hash is marked ACTIVE in RAG_CHUNKS_GOV survive;
    each surviving doc gets its metadata "source" replaced by the resolved
    best URL. Falls back to the raw results when no doc carries a hash.

    FIX: the IN-list used to be built by f-string interpolation of the hash
    values (injection-shaped, breaks on quotes) and could exceed Oracle's
    1000-item IN-list limit at the default k=3000. Hashes are now bound
    individually in batches of 1000 and the cursor is closed via try/finally.
    """
    docs = retriever.invoke(statement)

    hashes = [
        d.metadata.get("chunk_hash")
        for d in docs
        if d.metadata.get("chunk_hash")
    ]

    if not hashes:
        return docs[:k]

    active_hashes = set()
    cursor = oracle_conn.cursor()
    try:
        for start in range(0, len(hashes), 1000):
            batch = hashes[start:start + 1000]
            placeholders = ",".join(f":h{i}" for i in range(len(batch)))
            cursor.execute(
                f"""
                SELECT chunk_hash
                FROM RAG_CHUNKS_GOV
                WHERE status = 'ACTIVE'
                  AND chunk_hash IN ({placeholders})
                """,
                {f"h{i}": h for i, h in enumerate(batch)},
            )
            active_hashes.update(r[0] for r in cursor.fetchall())
    finally:
        cursor.close()

    final_docs = []
    for d in docs:
        h = d.metadata.get("chunk_hash")
        if h in active_hashes:
            d.metadata["source"] = resolve_chunk_source(d)
            final_docs.append(d)

    return final_docs[:k]
|
||
|
||
def search_manual_chunks_by_text(statement: str):
    """Find ACTIVE governance rows matching *statement* by simple text search.

    Matches case-insensitively against source, origin or chunk_hash, newest
    first, capped at 50 rows. Results are wrapped as metadata-only Documents
    (empty page_content) so they can flow through the same pipelines as
    vector-store hits.

    FIX: removed the stray line-continuation backslash that trailed the SQL
    literal, dropped the redundant local ``from langchain.schema import
    Document`` (the module already imports Document from
    langchain_core.documents — langchain.schema is the deprecated alias),
    and the cursor is now closed even when execution fails.
    """
    sql = """
        SELECT
            chunk_hash,
            source,
            created_at,
            origin,
            status
        FROM rag_chunks_gov
        WHERE status = 'ACTIVE'
          AND (
                LOWER(source) LIKE '%' || LOWER(:q) || '%'
             OR LOWER(origin) LIKE '%' || LOWER(:q) || '%'
             OR LOWER(chunk_hash) LIKE '%' || LOWER(:q) || '%'
          )
        ORDER BY created_at DESC
        FETCH FIRST 50 ROWS ONLY
    """

    cursor = oracle_conn.cursor()
    try:
        cursor.execute(sql, {"q": statement})
        rows = cursor.fetchall()
    finally:
        cursor.close()

    docs = []
    for h, source, created_at, origin, status in rows:
        docs.append(
            Document(
                page_content="",
                metadata={
                    "chunk_hash": h,
                    "source": source,
                    "created_at": created_at,
                    "origin": origin,
                    "status": status,
                }
            )
        )

    return docs
|
||
|
||
def search_chunks_for_invalidation(statement: str, k: int = 3000):
    """Collect candidate chunks for invalidation.

    Combines direct text matches from the governance table with semantic
    matches from the vector store, de-duplicated by chunk_hash. Manual
    matches come first, so they win on duplicates; docs without a hash are
    dropped.
    """
    manual = search_manual_chunks_by_text(statement)
    semantic = search_active_chunks(statement, k)

    unique_docs = []
    seen_hashes = set()

    for doc in [*manual, *semantic]:
        h = doc.metadata.get("chunk_hash")
        if h and h not in seen_hashes:
            seen_hashes.add(h)
            unique_docs.append(doc)

    return unique_docs
|
||
|
||
def revoke_chunk_by_hash(chunk_hash: str, reason: str):
    """Revoke a chunk: mark it REVOKED in RAG_CHUNKS_GOV and zero the
    confidence weight of every graph edge extracted from it.

    FIX: the edge update used to filter ``WHERE SOURCE_URL = :h`` while
    binding the chunk hash — comparing a URL column against a hash, so no
    edge was ever neutralized. Edges are keyed to their chunk by CHUNK_HASH.
    The cursor is also closed via try/finally now.
    """
    cursor = oracle_conn.cursor()
    try:
        cursor.execute("""
            UPDATE RAG_CHUNKS_GOV
            SET status = 'REVOKED',
                revoked_at = SYSTIMESTAMP,
                revocation_reason = :reason
            WHERE chunk_hash = :h
        """, {"h": chunk_hash, "reason": reason})

        cursor.execute(f"""
            UPDATE KG_EDGES_{GRAPH_NAME}
            SET CONFIDENCE_WEIGHT = 0
            WHERE CHUNK_HASH = :h
        """, {"h": chunk_hash})

        oracle_conn.commit()
    finally:
        cursor.close()
|
||
|
||
def get_chunk_metadata(chunk_hashes: list[str]) -> dict:
    """Fetch governance metadata for the given chunk hashes.

    Returns {hash: {"origin", "created_at", "status"}}; hashes absent from
    RAG_CHUNKS_GOV are simply omitted. An empty input yields an empty dict.
    """
    if not chunk_hashes:
        return {}

    # Bind each hash under a generated name (:h0, :h1, ...).
    binds = {f"h{i}": h for i, h in enumerate(chunk_hashes)}
    placeholders = ",".join(f":{name}" for name in binds)

    cursor = oracle_conn.cursor()

    cursor.execute(f"""
        SELECT
            CHUNK_HASH,
            ORIGIN,
            CREATED_AT,
            STATUS
        FROM RAG_CHUNKS_GOV
        WHERE CHUNK_HASH IN ({placeholders})
    """, binds)

    rows = cursor.fetchall()
    cursor.close()

    return {
        chunk: {
            "origin": origin,
            "created_at": created,
            "status": status,
        }
        for chunk, origin, created, status in rows
    }
|
||
|
||
def add_manual_knowledge_entry(
    *,
    text: str,
    author: str,
    reason: str,
    source: str = "MANUAL_INPUT",
    origin: str = "MANUAL",
    index_path: str = INDEX_PATH,
    also_update_graph: bool = True,
) -> str:
    """Register a manually-authored knowledge chunk across all stores.

    Records the chunk in the RAG_CHUNKS_GOV governance table (idempotent
    MERGE keyed on the content hash), adds it to the FAISS index, and
    optionally feeds it to the knowledge graph.

    Args:
        text: The knowledge content; hashed to produce the chunk id.
        author: Who supplied the entry (defaults to "unknown" if blank).
        reason: Why the entry was added; stored in document metadata.
        source: Source label stored in governance and metadata.
        origin: Origin tag ("MANUAL" by default).
        index_path: FAISS index directory to load/extend/save.
        also_update_graph: When True, best-effort graph update.

    Returns:
        The SHA-256 chunk hash of the normalized text.
    """
    text = (text or "").strip()
    reason = (reason or "").strip()
    author = (author or "").strip() or "unknown"

    h = chunk_hash(text)

    doc = Document(
        page_content=text,
        metadata={
            "source": source,
            "author": author,
            "reason": reason,
            "origin": origin,
            "created_at": datetime.utcnow().isoformat(),
            "chunk_hash": h,
        },
    )

    # Fix: wrap the cursor in try/finally so it does not leak if the MERGE
    # or commit raises.
    cur = oracle_conn.cursor()
    try:
        cur.execute(
            """
            MERGE INTO RAG_CHUNKS_GOV g
            USING (SELECT :h AS h FROM dual) src
            ON (g.CHUNK_HASH = src.h)
            WHEN NOT MATCHED THEN
            INSERT (CHUNK_HASH, SOURCE, STATUS, CREATED_AT, ORIGIN)
            VALUES (:h, :src, 'ACTIVE', SYSTIMESTAMP, :origin)
            """,
            {"h": h, "src": source, "origin": origin},
        )
        oracle_conn.commit()
    finally:
        cur.close()

    try:
        vs = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
        vs.add_documents([doc])
    except Exception:
        # NOTE(review): if load succeeds but add_documents fails, this branch
        # rebuilds the index from just this one document and the save below
        # overwrites the existing index — confirm that is intended.
        vs = FAISS.from_documents([doc], embedding=embeddings)

    vs.save_local(index_path)

    if also_update_graph:
        try:
            create_knowledge_graph([doc])
        except Exception:
            # Best-effort: a graph failure must not block knowledge ingestion.
            pass

    return h
def build_structured_evidence(docs, max_chunks=150):
    """Turn retrieved documents into quote/source evidence records.

    Takes at most *max_chunks* documents, truncates each body to 3000
    characters, strips any inline "Reference: <url>" marker (the source is
    resolved separately), and pairs the quote with its resolved source.

    Args:
        docs: Ranked Documents to convert.
        max_chunks: Cap on how many documents are included.

    Returns:
        List of {"quote": str, "source": str} dicts.
    """
    # Hoisted so the pattern is compiled once, not per document.
    reference_marker = re.compile(r"Reference:\s*\S+")

    structured = []
    for document in docs[:max_chunks]:
        excerpt = reference_marker.sub("", document.page_content[:3000])
        structured.append({
            "quote": excerpt,
            "source": resolve_chunk_source(document),
        })

    return structured
def get_context_from_requirement(req: dict):
    """Assemble text and graph evidence for a parsed RFP requirement.

    Pipeline: keyword extraction -> vector search + graph query ->
    negative-capability conflict injection -> graph-aware re-ranking ->
    structured evidence.

    Args:
        req: Parsed requirement with "requirement_type", "subject" and
            optionally "expected_value".

    Returns:
        Dict with "text_context", "graph_context", "requirement_type",
        "subject" and "expected_value" keys, ready for the decision prompt.
    """
    query_terms = extract_graph_keywords_from_requirement(req)

    docs = search_active_chunks(query_terms)
    graph_context = query_knowledge_graph(query_terms, top_k=50, min_score=1)

    neg = detect_negative_conflict(graph_context, req)
    if neg["conflict"]:
        print("⚠️ Negative capability found in graph.")
        # Inject a synthetic, maximum-priority edge so downstream logic
        # sees the conflict explicitly.
        conflict_row = (
            neg["service"],
            "NEGATIVE_CONFLICT_DETECTED",
            neg["capability"],
            None,
            999,
        )
        graph_context.append(conflict_row)

    graph_terms = extract_terms_from_graph_text(graph_context)
    reranked = rerank_documents_with_graph_terms(
        docs,
        query_terms,
        graph_terms,
        top_k=30,
    )
    structured_evidence = build_structured_evidence(reranked)

    return {
        "text_context": structured_evidence,
        "graph_context": graph_context,
        "requirement_type": req["requirement_type"],
        "subject": req["subject"],
        "expected_value": req.get("expected_value", ""),
    }
# =========================
# Main Function
# =========================
def chat():
    """Interactive entry point: index any new PDFs into FAISS + the knowledge
    graph, then answer RFP questions in a REPL loop.

    NOTE(review): the ``chain`` built below is a local variable and is never
    invoked in this function — answers come from
    ``answer_question_with_retries()``, which relies on the module-level chain
    set up by ``load_all()``. Confirm whether the local chain is still needed.
    """
    PDF_FOLDER = Path("docs")  # folder containing the source PDF files

    pdf_paths = sorted(
        str(p) for p in PDF_FOLDER.glob("*.pdf")
    )

    # Documents indexed in previous runs are skipped below.
    already_indexed_docs = load_previously_indexed_docs()
    updated_docs = set()

    try:
        vectorstore = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
        print("✔️ FAISS index loaded.")
    except Exception:
        vectorstore = None
        print("⚠️ FAISS index not found, creating a new one.")

    new_chunks = []

    for pdf_path in tqdm(pdf_paths, desc=f"📄 Processing PDFs"):
        print(f" {os.path.basename(pdf_path)}")
        if pdf_path in already_indexed_docs:
            print(f"✅ Document already indexed: {pdf_path}")
            continue
        full_text = read_pdfs(pdf_path=pdf_path)
        path_url = filename_to_url(os.path.basename(pdf_path))

        text_chunks = smart_split_text(full_text, max_chunk_size=10_000)
        # Carries an incomplete trailing chapter over into the next chunk.
        overflow_buffer = ""

        for chunk in tqdm(text_chunks, desc=f"📄 Processing text chunks", dynamic_ncols=True, leave=False):
            current_text = overflow_buffer + chunk

            treated_text = semantic_chunking(current_text)

            # semantic_chunking is expected to return an LLM message object
            # exposing `.content`; anything else is reported and skipped.
            if hasattr(treated_text, "content"):
                chapters = split_llm_output_into_chapters(treated_text.content)

                last_chapter = chapters[-1] if chapters else ""

                # A chapter not ending in sentence punctuation is assumed to
                # be cut mid-sentence; defer it to the next chunk iteration.
                if last_chapter and not last_chapter.strip().endswith((".", "!", "?", "\n\n")):
                    print("📌 Last chapter seems incomplete, saving for the next cycle")
                    overflow_buffer = last_chapter
                    chapters = chapters[:-1]
                else:
                    overflow_buffer = ""

                for chapter_text in chapters:
                    # Append the source URL so the citation survives inside
                    # the chunk text itself.
                    reference_url = "Reference: " + path_url
                    chapter_text = chapter_text + "\n" + reference_url

                    h = chunk_hash(chapter_text)

                    cursor = oracle_conn.cursor()

                    # Idempotent governance insert: MERGE only adds hashes
                    # not seen before.
                    cursor.execute("""
                        MERGE INTO RAG_CHUNKS_GOV g
                        USING (SELECT :h AS h FROM dual) src
                        ON (g.CHUNK_HASH = src.h)
                        WHEN NOT MATCHED THEN
                        INSERT (
                            CHUNK_HASH,
                            SOURCE,
                            STATUS,
                            CREATED_AT,
                            ORIGIN
                        )
                        VALUES (
                            :h,
                            :src,
                            'ACTIVE',
                            SYSTIMESTAMP,
                            :origin
                        )
                    """, {
                        "h": h,
                        "src": pdf_path,
                        "origin": "PDF"
                    })
                    oracle_conn.commit()
                    # NOTE(review): cursor leaks if execute/commit raises —
                    # consider a try/finally here.
                    cursor.close()

                    doc = Document(
                        page_content=chapter_text,
                        metadata={
                            "source": pdf_path,
                            "reference": reference_url,
                            "chunk_hash": h,
                            "created_at": datetime.utcnow().isoformat()
                        }
                    )

                    new_chunks.append(doc)
                    print(f"✅ New chapter indexed:\n{chapter_text}...\n")

            else:
                print(f"[ERROR] semantic_chunking returned unexpected type: {type(treated_text)}")

        updated_docs.add(str(pdf_path))

    if new_chunks:
        if vectorstore:
            vectorstore.add_documents(new_chunks)
        else:
            vectorstore = FAISS.from_documents(new_chunks, embedding=embeddings)

        vectorstore.save_local(INDEX_PATH)
        save_indexed_docs(already_indexed_docs.union(updated_docs))
        print(f"💾 {len(new_chunks)} chunks added to FAISS index.")

        print("🧠 Building knowledge graph...")
        create_knowledge_graph(new_chunks)

    else:
        print("📁 No new documents to index.")

    # NOTE(review): retriever is created here but never referenced again in
    # this function — confirm it is still needed.
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 50, "fetch_k": 100})

    RFP_DECISION_TEMPLATE = """
You are answering an RFP requirement with risk awareness.

You MUST validate the answer ONLY using CAPABILITY nodes returned in GRAPH FACTS.
If no capability exists, answer MUST be "UNKNOWN".

Requirement:
Type: {requirement_type}
Subject: {subject}
Expected value: {expected_value}

**Document evidence**:
{text_context}

**Graph evidence**:
{graph_context}

Decision rules:
- Answer ONLY with YES, NO or PARTIAL
- If value differs, answer PARTIAL
- If not found, answer NO

Interpretation rules (MANDATORY):
- If a capability is supported but requires reboot, downtime, or restart, it STILL counts as YES unless the requirement explicitly forbids it.
- "Servidor em funcionamento" means the resource exists and is active before the operation, not that it must remain online without interruption.
- Only answer NO if the operation is NOT supported at all or requires destroying and recreating the resource.
- Reboot, restart, or brief unavailability MUST NOT be interpreted as lack of support.

Confidence rules:
- HIGH: Explicit evidence directly answers the requirement
- MEDIUM: Evidence partially matches or requires light interpretation
- LOW: Requirement is ambiguous OR evidence is indirect OR missing

Ambiguity rules:
- ambiguity_detected = true if:
- The requirement can be interpreted in more than one way
- Keywords are vague (e.g. "support", "integration", "capability")
- Evidence does not clearly bind to subject + expected value

Service scope rules (MANDATORY):
- Evidence is valid ONLY if it refers to the SAME service category as the requirement.
- Do NOT use evidence from a different Oracle Cloud service to justify another.
- PREFER ALWAYS URL to source

SOURCE RULE:
- GIVE MANY SOURCES URL
- GIVE A COMPLETE PATH OF URL SOURCES TO UNDERSTAND THE CONCEPTS THEME
- GIVE one or more OVERVIEW SOURCE URL
- GIVE one or more SOLUTION AND ARCHITECTURE SOURCE URL

OUTPUT CONSTRAINTS (MANDATORY):
- Return ONLY a valid JSON object
- Do NOT include explanations, comments, markdown, lists, or code fences
- Do NOT write any text before or after the JSON
- The response must start with an opening curly brace and end with a closing curly brace

LANGUAGE RULE (MANDATORY): **DO TRANSLATION AS THE LAST STEP**
- Write ALL textual values in {lang}
- Keep JSON keys in English
- Do NOT translate keys

JSON schema (return exactly this structure):
{{
"answer": "YES | NO | PARTIAL",
"confidence": "HIGH | MEDIUM | LOW",
"ambiguity_detected": true,
"confidence_reason": "<short reason>",
"justification": "<short factual explanation>",
"evidence": [
{{ "quote": "...",
"source": "..." }},
{{ "quote": "...",
"source": "..." }},
{{ "quote": "...",
"source": "..." }},
]
}}
"""
    prompt = PromptTemplate.from_template(RFP_DECISION_TEMPLATE)

    # NOTE(review): each RunnableMap entry below invokes
    # get_context_from_requirement() independently, so retrieval runs twice
    # per question — confirm whether this duplication is intentional.
    chain = (
        RunnableLambda(lambda q: {
            "question": q,
            "req": parse_rfp_requirement(q)
        })
        | RunnableMap({
            "text_context": lambda x: get_context_from_requirement(x["req"])["text_context"],
            "graph_context": lambda x: get_context_from_requirement(x["req"])["graph_context"],
            "requirement_type": lambda x: x["req"]["requirement_type"],
            "subject": lambda x: x["req"]["subject"],
            "expected_value": lambda x: x["req"].get("expected_value", ""),
            "lang": lambda x: normalize_lang(detect(x["question"]))
        })
        | prompt
        | llm
        | StrOutputParser()
    )

    print("✅ READY")

    while True:
        query = input("❓ Question (or 'quit' to exit): ")
        if query.lower() == "quit":
            break
        response = answer_question_with_retries(query, max_attempts=3)

        print("\n📜 RESPONSE:\n")
        print(response)
        print("\n" + "=" * 80 + "\n")
def safe_parse_llm_answer(raw: str) -> dict:
    """Parse LLM output that should be a bare JSON object.

    Strips markdown code fences the model sometimes adds despite the prompt,
    then attempts ``json.loads``. On any failure (including ``raw=None``) a
    conservative fallback answer is returned instead of raising.

    Args:
        raw: The raw LLM response text (may be None).

    Returns:
        The parsed dict, or a safe NO/LOW fallback on parse failure.
    """
    try:
        # `or ""` guards against raw=None without relying on the except path.
        cleaned = (raw or "").replace("```json", "").replace("```", "").strip()
        return json.loads(cleaned)
    except Exception:
        # Fix: fallback now includes "ambiguity_detected", matching the
        # fallback shape used by answer_question().
        return {
            "answer": "NO",
            "confidence": "LOW",
            "ambiguity_detected": True,
            "confidence_reason": "Invalid JSON from LLM",
            "justification": "",
            "evidence": []
        }
def answer_question_with_retries(
    question: str,
    max_attempts: int = 3
) -> dict:
    """Ask the LLM up to *max_attempts* times and return the best answer.

    Each attempt's YES answer is checked against explicit negative graph
    capabilities (DOES_NOT_SUPPORT / NEGATIVE_CONFLICT_DETECTED edges) and
    downgraded to NO on conflict. Stops early on a YES/HIGH answer; otherwise
    returns the highest-scoring attempt, with sources sanitized.

    Args:
        question: The raw RFP question text.
        max_attempts: Maximum number of LLM attempts.

    Returns:
        The parsed (and sanitized) answer dict.
    """
    best = None
    best_score = -1

    # The graph context is needed up front to detect negative-capability
    # conflicts against each attempt's answer.
    req = parse_rfp_requirement(question)
    graph_context = query_knowledge_graph(
        extract_graph_keywords_from_requirement(req),
        top_k=20
    )

    for attempt in range(1, max_attempts + 1):

        raw = answer_question(question)
        parsed = safe_parse_llm_answer(raw)

        # Negative-conflict handling: an explicit negative edge in the graph
        # overrides an optimistic YES from the LLM.
        if parsed.get("answer") == "YES":
            for row in graph_context:
                service, edge_type, target, *_ = row

                if edge_type in ("DOES_NOT_SUPPORT", "NEGATIVE_CONFLICT_DETECTED"):
                    print("❌ Conflict detected — forcing downgrade to NO")

                    parsed["answer"] = "NO"
                    parsed["confidence"] = "HIGH"
                    parsed["confidence_reason"] = \
                        "Graph contains explicit negative capability"

                    break

        ans = parsed.get("answer", "NO")
        conf = parsed.get("confidence", "LOW")

        score = score_answer(parsed)

        print(
            f"🔁 Attempt {attempt}: "
            f"answer={ans} confidence={conf} score={score}"
        )

        if score > best_score:
            best = parsed
            best_score = score

        # Ideal stopping condition: confident positive answer.
        if ans == "YES" and conf == "HIGH":
            print("✅ Optimal answer found (YES/HIGH)")
            return parsed

    print("⚠️ Optimal answer not found, returning best available")

    # Fix: `best` could remain None (every attempt scored <= -1, or
    # max_attempts < 1), and None was then passed downstream.
    if best is None:
        best = {
            "answer": "NO",
            "confidence": "LOW",
            "ambiguity_detected": True,
            "confidence_reason": "No scored answer produced",
            "justification": "",
            "evidence": [],
        }

    best = validate_and_sanitize_sources(best)

    return best
|
||
# =========================
|
||
# ARCHITECTURE SOURCE VALIDATION
|
||
# =========================
|
||
|
||
def _sanitize_source_field(src):
    """Normalize a source field: a URL string or a list of URL strings.

    Every URL that does not resolve is replaced by INVALID_SOURCE_TOKEN;
    an empty/missing field becomes the token itself. The input container
    shape (scalar vs list) is preserved.
    """
    if not src:
        return INVALID_SOURCE_TOKEN

    if isinstance(src, list):
        return [
            item if url_exists(item) else INVALID_SOURCE_TOKEN
            for item in src
        ]

    return src if url_exists(src) else INVALID_SOURCE_TOKEN
def validate_architecture_sources(plan: dict) -> dict:
    """Sanitize every source URL carried by an architecture plan in place.

    Covers two locations: ``architecture.components[].source`` and
    ``decisions[].evidence.source``. A falsy *plan* is returned untouched.

    Args:
        plan: The architecture plan dict (mutated in place).

    Returns:
        The same plan object, with dead/empty sources replaced.
    """
    if not plan:
        return plan

    # architecture.components[].source
    for component in plan.get("architecture", {}).get("components", []):
        component["source"] = _sanitize_source_field(component.get("source"))

    # decisions[].evidence.source
    for decision in plan.get("decisions", []):
        evidence = decision.get("evidence", {}) or {}
        evidence["source"] = _sanitize_source_field(evidence.get("source"))
        decision["evidence"] = evidence

    return plan
|
||
# =========================
|
||
# LOADERS
|
||
# =========================
|
||
|
||
def load_all():
    """(Re)load module-level FAISS retriever, decision prompt and chains.

    Side effects: assigns the module globals ``vectorstore``, ``retriever``,
    ``chain``, ``RFP_DECISION_TEMPLATE`` and ``chain_architecture``. If the
    FAISS index cannot be loaded, the retriever globals are left unset and
    only a message is printed (chain construction still proceeds).
    """
    global vectorstore, retriever, graph, chain, RFP_DECISION_TEMPLATE, chain_architecture
    print("🔄 Loading FAISS + Graph + Chain...")

    try:
        vectorstore = FAISS.load_local(
            INDEX_PATH,
            embeddings,
            allow_dangerous_deserialization=True
        )

        retriever = vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 50, "fetch_k": 100}
        )
    # Fix: was a bare `except:` which also swallowed KeyboardInterrupt
    # and SystemExit.
    except Exception:
        print("No Faiss")

    RFP_DECISION_TEMPLATE = """
You are answering an RFP requirement with risk awareness.

Requirement:
Type: {requirement_type}
Subject: {subject}
Expected value: {expected_value}

Document evidence:
{text_context}

Graph evidence:
{graph_context}

Decision rules:
- Answer ONLY with YES, NO or PARTIAL
- If value differs, answer PARTIAL
- If not found, answer NO

Interpretation rules (MANDATORY):
- If a capability is supported but requires reboot, downtime, or restart, it STILL counts as YES unless the requirement explicitly forbids it.
- "Servidor em funcionamento" means the resource exists and is active before the operation, not that it must remain online without interruption.
- Only answer NO if the operation is NOT supported at all or requires destroying and recreating the resource.
- Reboot, restart, or brief unavailability MUST NOT be interpreted as lack of support.

Confidence rules:
- HIGH: Explicit evidence directly answers the requirement
- MEDIUM: Evidence partially matches or requires light interpretation
- LOW: Requirement is ambiguous OR evidence is indirect OR missing

Ambiguity rules:
- ambiguity_detected = true if:
- The requirement can be interpreted in more than one way
- Keywords are vague (e.g. "support", "integration", "capability")
- Evidence does not clearly bind to subject + expected value

Service scope rules (MANDATORY):
- Do NOT use evidence from a different Oracle Cloud service to justify another.

SOURCE RULE:
- GIVE MANY SOURCES URL
- GIVE A COMPLETE PATH OF URL SOURCES TO UNDERSTAND THE CONCEPTS THEME
- GIVE one or more OVERVIEW SOURCE URL
- GIVE one or more SOLUTION AND ARCHITECTURE SOURCE URL

OUTPUT CONSTRAINTS (MANDATORY):
- Return ONLY a valid JSON object
- Do NOT include explanations, comments, markdown, lists, or code fences
- Do NOT write any text before or after the JSON
- The response must start with an opening curly brace and end with a closing curly brace

LANGUAGE RULE (MANDATORY): **DO TRANSLATION AS THE LAST STEP**
- Write ALL textual values in {lang}
- Keep JSON keys in English
- Do NOT translate keys

JSON schema (return exactly this structure):
{{
"answer": "YES | NO | PARTIAL",
"confidence": "HIGH | MEDIUM | LOW",
"ambiguity_detected": true,
"confidence_reason": "<short reason>",
"justification": "<short factual explanation>",
"evidence": [
{{ "quote": "...",
"source": "..." }},
{{ "quote": "...",
"source": "..." }},
{{ "quote": "...",
"source": "..." }},
]

}}
"""
    prompt = PromptTemplate.from_template(RFP_DECISION_TEMPLATE)

    def _prepare_prompt_inputs(x: dict) -> dict:
        # Fix: the previous RunnableMap called get_context_from_requirement()
        # twice per question (once for text_context, once for graph_context),
        # doubling vector-search and graph-query cost. Compute it once here.
        ctx = get_context_from_requirement(x["req"])
        return {
            "text_context": ctx["text_context"],
            "graph_context": ctx["graph_context"],
            "requirement_type": x["req"]["requirement_type"],
            "subject": x["req"]["subject"],
            "expected_value": x["req"].get("expected_value", ""),
            "lang": normalize_lang(detect(x["question"])),
        }

    chain = (
        RunnableLambda(lambda q: {
            "question": q,
            "req": parse_rfp_requirement(q)
        })
        | RunnableLambda(_prepare_prompt_inputs)
        | prompt
        | llm
        | StrOutputParser()
    )

    chain_architecture = build_architecture_chain()

    print("✅ Loaded!")
def answer_question(
    question: str,
    max_attempts: int = MAX_ATTEMPTS
) -> str:
    """Fan out *max_attempts* parallel chain invocations and return the best.

    Each attempt is scored; the highest-scoring parsed answer wins and is
    returned as a JSON string after source sanitization. If every attempt
    fails, a safe NO/LOW fallback JSON is returned instead.
    """

    def run_attempt(attempt_no):
        # One full chain invocation; never raises — failures score -1 so
        # they are filtered out later.
        try:
            raw_output = call_llm(chain.invoke, question)

            parsed_output = safe_parse_llm_answer(raw_output)
            attempt_score = score_answer(parsed_output)

            return {
                "attempt": attempt_no,
                "raw": raw_output,
                "parsed": parsed_output,
                "score": attempt_score
            }

        except Exception as exc:
            print(f"❌ Attempt {attempt_no} failed: {exc}")

            return {
                "attempt": attempt_no,
                "raw": "",
                "parsed": {},
                "score": -1
            }

    completed = []

    with ThreadPoolExecutor(max_workers=max_attempts) as pool:
        pending = [
            pool.submit(run_attempt, n)
            for n in range(1, max_attempts + 1)
        ]

        for future in as_completed(pending):
            try:
                outcome = future.result()
            except Exception as exc:
                print("❌ Future crashed:", exc)
                continue
            # Failed attempts carry score -1 and are dropped here.
            if outcome["score"] >= 0:
                completed.append(outcome)

    # Absolute fallback: nothing usable came back from any attempt.
    if not completed:
        print("⚠️ All attempts failed — returning safe fallback")
        return json.dumps({
            "answer": "NO",
            "confidence": "LOW",
            "ambiguity_detected": True,
            "confidence_reason": "All LLM attempts failed",
            "justification": "",
            "evidence": []
        })

    completed.sort(key=lambda item: item["score"], reverse=True)

    for item in completed:
        print(
            f"⚡ Attempt {item['attempt']} | "
            f"answer={item['parsed'].get('answer')} | "
            f"confidence={item['parsed'].get('confidence')} | "
            f"score={item['score']}"
        )

    winner = completed[0]

    print(f"\n🏆 Selected attempt {winner['attempt']} (score={winner['score']})")

    sanitized = validate_and_sanitize_sources(winner["parsed"])
    return json.dumps(sanitized)
||
def reload_all():
    """Re-initialize all module-level state (FAISS retriever, prompt, chains)
    by delegating to load_all()."""
    load_all()
||
# NOTE(review): this runs at import time, so merely importing the module
# loads FAISS and builds the chains — confirm that side effect is intended.
reload_all()

# 🚀 Run
if __name__ == "__main__":
    chat()
    #rebuild_graph_from_faiss(INDEX_PATH, reverse_resume=True)