This commit is contained in:
2026-01-14 09:44:22 -03:00
parent 1dce8e20d3
commit 51cf883150
2 changed files with 191 additions and 77 deletions

BIN
.DS_Store vendored

Binary file not shown.

View File

@@ -7,7 +7,7 @@ from langchain.schema.runnable import RunnableMap
from langchain_community.document_loaders import UnstructuredPDFLoader, PyMuPDFLoader from langchain_community.document_loaders import UnstructuredPDFLoader, PyMuPDFLoader
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda from langchain_core.runnables import RunnableLambda
from pathlib import Path
from tqdm import tqdm from tqdm import tqdm
import os import os
import pickle import pickle
@@ -15,6 +15,7 @@ import re
import atexit import atexit
import oracledb import oracledb
import json import json
import base64
# ========================= # =========================
# Oracle Autonomous Configuration # Oracle Autonomous Configuration
@@ -22,9 +23,8 @@ import json
WALLET_PATH = "Wallet_oradb23ai" WALLET_PATH = "Wallet_oradb23ai"
DB_ALIAS = "oradb23ai_high" DB_ALIAS = "oradb23ai_high"
USERNAME = "admin" USERNAME = "admin"
PASSWORD = "**********" PASSWORD = "Moniquinha1972"
os.environ["TNS_ADMIN"] = WALLET_PATH os.environ["TNS_ADMIN"] = WALLET_PATH
GRAPH_NAME = "GRAPH_DB_1"
# ========================= # =========================
# Global Configurations # Global Configurations
@@ -32,7 +32,7 @@ GRAPH_NAME = "GRAPH_DB_1"
INDEX_PATH = "./faiss_index" INDEX_PATH = "./faiss_index"
PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl") PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl")
chapter_separator_regex = r"^(#{1,6} .+|\*\*.+\*\*)$" chapter_separator_regex = r"^(#{1,6} .+|\*\*.+\*\*)$"
pdf_paths = ['<YOUR_KNOWLEDGE_BASE_FILE>.pdf'] GRAPH_NAME = "OCI_GRAPH"
# ========================= # =========================
# LLM Definitions # LLM Definitions
@@ -52,7 +52,6 @@ llm_for_rag = ChatOCIGenAI(
auth_profile="DEFAULT", auth_profile="DEFAULT",
) )
embeddings = OCIGenAIEmbeddings( embeddings = OCIGenAIEmbeddings(
model_id="cohere.embed-multilingual-v3.0", model_id="cohere.embed-multilingual-v3.0",
service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com", service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
@@ -70,6 +69,12 @@ oracle_conn = oracledb.connect(
) )
atexit.register(lambda: oracle_conn.close()) atexit.register(lambda: oracle_conn.close())
def filename_to_url(filename: str, suffix: str = ".pdf") -> str:
    """Decode a URL-safe base64 file name back into the text it encodes.

    Args:
        filename: File name whose stem is URL-safe base64, optionally
            followed by ``suffix``.
        suffix: Extension to strip before decoding. An empty suffix means
            nothing is stripped.

    Returns:
        The decoded stem, interpreted as UTF-8 text.

    Raises:
        UnicodeEncodeError: If the stem contains non-ASCII characters.
        binascii.Error: If the stem is not valid base64.
        UnicodeDecodeError: If the decoded bytes are not valid UTF-8.
    """
    # Guard against an empty suffix: every string "ends with" "", and
    # filename[:-0] would wipe out the whole name.
    if suffix and filename.endswith(suffix):
        filename = filename[: -len(suffix)]

    # Base64 padding ("=") is frequently dropped from file names; restore it
    # so the decoder does not reject an otherwise valid stem.
    filename += "=" * (-len(filename) % 4)

    decoded = base64.urlsafe_b64decode(filename.encode("ascii"))
    return decoded.decode("utf-8")
# ========================= # =========================
# Oracle Graph Client # Oracle Graph Client
# ========================= # =========================
@@ -81,7 +86,6 @@ def ensure_oracle_text_index(
): ):
cursor = conn.cursor() cursor = conn.cursor()
# 1. Verifica se índice existe e status
cursor.execute(""" cursor.execute("""
SELECT status SELECT status
FROM user_indexes FROM user_indexes
@@ -92,7 +96,6 @@ def ensure_oracle_text_index(
index_exists = row is not None index_exists = row is not None
index_status = row[0] if row else None index_status = row[0] if row else None
# 2. Se índice não existe → cria e NÃO sincroniza agora
if not index_exists: if not index_exists:
print(f"🛠️ Creating Oracle Text index {index_name}") print(f"🛠️ Creating Oracle Text index {index_name}")
@@ -107,7 +110,6 @@ def ensure_oracle_text_index(
print(f"✅ Index {index_name} created (sync deferred)") print(f"✅ Index {index_name} created (sync deferred)")
return return
# 3. Se índice existe mas está inválido → drop + recreate
if index_status != "VALID": if index_status != "VALID":
print(f"⚠️ Index {index_name} is {index_status}. Recreating...") print(f"⚠️ Index {index_name} is {index_status}. Recreating...")
@@ -129,7 +131,6 @@ def ensure_oracle_text_index(
print(f"♻️ Index {index_name} recreated (sync deferred)") print(f"♻️ Index {index_name} recreated (sync deferred)")
return return
# 4. Índice existe e está VALID → sincroniza com proteção
print(f"🔄 Syncing Oracle Text index: {index_name}") print(f"🔄 Syncing Oracle Text index: {index_name}")
try: try:
cursor.execute(f""" cursor.execute(f"""
@@ -191,19 +192,21 @@ def create_tables_if_not_exist(conn):
create_tables_if_not_exist(oracle_conn) create_tables_if_not_exist(oracle_conn)
ensure_oracle_text_index(
oracle_conn,
"ENTITIES_" + GRAPH_NAME,
"NAME",
"IDX_ENT_" + GRAPH_NAME + "_NAME"
)
ensure_oracle_text_index( # IF GRAPH INDEX PROBLEM, Reindex
oracle_conn, # ensure_oracle_text_index(
"RELATIONS_" + GRAPH_NAME, # oracle_conn,
"RELATION_TYPE", # "ENTITIES_" + GRAPH_NAME,
"IDX_REL_" + GRAPH_NAME + "_RELTYPE" # "NAME",
) # "IDX_ENT_" + GRAPH_NAME + "_NAME"
# )
#
# ensure_oracle_text_index(
# oracle_conn,
# "RELATIONS_" + GRAPH_NAME,
# "RELATION_TYPE",
# "IDX_REL_" + GRAPH_NAME + "_RELTYPE"
# )
def create_knowledge_graph(chunks): def create_knowledge_graph(chunks):
cursor = oracle_conn.cursor() cursor = oracle_conn.cursor()
@@ -317,24 +320,145 @@ def create_knowledge_graph(chunks):
def parse_rfp_requirement(question: str) -> dict: def parse_rfp_requirement(question: str) -> dict:
prompt = f""" prompt = f"""
You are an RFP requirement extractor. You are an RFP requirement NORMALIZER for Oracle Cloud Infrastructure (OCI).
Return the result STRICTLY between the tags <json> and </json>. Your job is NOT to summarize the question.
Do NOT write anything outside these tags. Your job is to STRUCTURE the requirement so it can be searched in:
- Technical documentation
Question: - Knowledge Graph
{question} - Vector databases
<json> ────────────────────────────────
{{ STEP 1 — Understand the requirement
"requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL", ────────────────────────────────
"subject": "<short subject>", From the question, identify:
"expected_value": "<value or condition if any>", 1. The PRIMARY OCI SERVICE CATEGORY involved
"decision_type": "YES_NO | YES_NO_PARTIAL", 2. The MAIN TECHNICAL SUBJECT (short and precise)
"keywords": ["keyword1", "keyword2"] 3. The EXPECTED TECHNICAL CAPABILITY or CONDITION (if any)
}}
</json> IMPORTANT:
""" - Ignore marketing language
- Ignore phrases like "possui", "permite", "oferece"
- Focus ONLY on concrete technical meaning
────────────────────────────────
STEP 2 — Mandatory service classification
────────────────────────────────
You MUST choose ONE primary technology from the list below
and INCLUDE IT EXPLICITLY in the keywords list.
Choose the MOST SPECIFIC applicable item.
☁️ OCI SERVICE CATEGORIES (MANDATORY)
🖥️ Compute (IaaS)
- compute
- compute instances
- virtual machine
- bare metal
- gpu
- hpc
- confidential computing
- autoscaling
- instance pools
- live migration
- ocvs (vmware)
- arm compute
💾 Storage
- object storage
- archive storage
- block volume
- boot volume
- file storage
- volume groups
- snapshots
- replication
🌐 Networking
- vcn
- load balancer
- network load balancer
- dns
- fastconnect
- drg
- firewall
- waf
- bastion
- vtap
- private endpoint
🔐 Security & Identity
- iam
- compartments
- policies
- oci vault
- key management
- certificates
- secrets
- cloud guard
- security zones
- vulnerability scanning
- data safe
- audit
- logging
- shielded instances
📦 Containers & Cloud Native
- oke
- kubernetes
- container registry
- api gateway
- functions
- streaming
- events
- service mesh
🗄️ Databases
- autonomous database
- adw
- atp
- base database
- exadata
- mysql
- nosql
📊 Analytics & AI
- analytics cloud
- data science
- data catalog
- big data service
- generative ai
- ai services
────────────────────────────────
STEP 3 — Keywords rules (CRITICAL)
────────────────────────────────
The "keywords" field MUST:
- ALWAYS include at least ONE OCI service keyword (e.g. "compute", "object storage", "oke")
- Include technical capability terms (e.g. resize, autoscaling, encryption)
- NEVER include generic verbs (permitir, possuir, oferecer)
- NEVER include full sentences
────────────────────────────────
STEP 4 — Output rules
────────────────────────────────
Return ONLY valid JSON between <json> tags.
Do NOT explain your reasoning.
Question:
{question}
<json>
{{
"requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL",
"subject": "<short technical subject, e.g. 'Compute Instances'>",
"expected_value": "<technical capability or condition, or empty string>",
"decision_type": "YES_NO | YES_NO_PARTIAL",
"keywords": ["mandatory_oci_service", "technical_capability", "additional_term"]
}}
</json>
"""
resp = llm_for_rag.invoke(prompt) resp = llm_for_rag.invoke(prompt)
raw = resp.content.strip() raw = resp.content.strip()
@@ -498,7 +622,8 @@ def semantic_chunking(text):
2. Separate paragraphs by heading 2. Separate paragraphs by heading
3. Indicate columns with [COLUMN 1], [COLUMN 2] if present 3. Indicate columns with [COLUMN 1], [COLUMN 2] if present
4. Indicate tables with [TABLE] in markdown format 4. Indicate tables with [TABLE] in markdown format
5. Indicate explicity metrics (if it exists) 5. ALWAYS PUT THE URL if there is a Reference
6. Indicate explicity metrics (if it exists)
Examples: Examples:
- Oracle Financial Services RTO is 1 hour - Oracle Financial Services RTO is 1 hour
- The Oracle Banking Supply Chain Finance Cloud Service A maximum number of 10K Hosted Transactions - The Oracle Banking Supply Chain Finance Cloud Service A maximum number of 10K Hosted Transactions
@@ -515,7 +640,6 @@ def semantic_chunking(text):
return response return response
def read_pdfs(pdf_path): def read_pdfs(pdf_path):
if "-ocr" in pdf_path: if "-ocr" in pdf_path:
doc_pages = PyMuPDFLoader(str(pdf_path)).load() doc_pages = PyMuPDFLoader(str(pdf_path)).load()
@@ -568,7 +692,11 @@ def save_indexed_docs(docs):
# Main Function # Main Function
# ========================= # =========================
def chat(): def chat():
pdf_paths = ['RFP - Financial v2.pdf'] PDF_FOLDER = Path("docs") # pasta onde estão os PDFs
pdf_paths = sorted(
str(p) for p in PDF_FOLDER.glob("*.pdf")
)
already_indexed_docs = load_previously_indexed_docs() already_indexed_docs = load_previously_indexed_docs()
updated_docs = set() updated_docs = set()
@@ -588,6 +716,7 @@ def chat():
print(f"✅ Document already indexed: {pdf_path}") print(f"✅ Document already indexed: {pdf_path}")
continue continue
full_text = read_pdfs(pdf_path=pdf_path) full_text = read_pdfs(pdf_path=pdf_path)
path_url = filename_to_url(os.path.basename(pdf_path))
text_chunks = smart_split_text(full_text, max_chunk_size=10_000) text_chunks = smart_split_text(full_text, max_chunk_size=10_000)
overflow_buffer = "" overflow_buffer = ""
@@ -610,7 +739,9 @@ def chat():
overflow_buffer = "" overflow_buffer = ""
for chapter_text in chapters: for chapter_text in chapters:
doc = Document(page_content=chapter_text, metadata={"source": pdf_path}) reference_url = "Reference: " + path_url
chapter_text = chapter_text + "\n" + reference_url
doc = Document(page_content=chapter_text, metadata={"source": pdf_path, "reference": reference_url})
new_chunks.append(doc) new_chunks.append(doc)
print(f"✅ New chapter indexed:\n{chapter_text}...\n") print(f"✅ New chapter indexed:\n{chapter_text}...\n")
@@ -653,9 +784,14 @@ def chat():
Decision rules: Decision rules:
- Answer ONLY with YES, NO or PARTIAL - Answer ONLY with YES, NO or PARTIAL
- Do NOT assume anything not explicitly stated
- If value differs, answer PARTIAL - If value differs, answer PARTIAL
- If not found, answer NO - If not found, answer NO
Interpretation rules (MANDATORY):
- If a capability is supported but requires reboot, downtime, or restart, it STILL counts as YES unless the requirement explicitly forbids it.
- "Servidor em funcionamento" means the resource exists and is active before the operation, not that it must remain online without interruption.
- Only answer NO if the operation is NOT supported at all or requires destroying and recreating the resource.
- Reboot, restart, or brief unavailability MUST NOT be interpreted as lack of support.
Confidence rules: Confidence rules:
- HIGH: Explicit evidence directly answers the requirement - HIGH: Explicit evidence directly answers the requirement
@@ -671,7 +807,7 @@ def chat():
Service scope rules (MANDATORY): Service scope rules (MANDATORY):
- Evidence is valid ONLY if it refers to the SAME service category as the requirement. - Evidence is valid ONLY if it refers to the SAME service category as the requirement.
- Do NOT use evidence from a different Oracle Cloud service to justify another. - Do NOT use evidence from a different Oracle Cloud service to justify another.
OUTPUT CONSTRAINTS (MANDATORY): OUTPUT CONSTRAINTS (MANDATORY):
- Return ONLY a valid JSON object - Return ONLY a valid JSON object
- Do NOT include explanations, comments, markdown, lists, or code fences - Do NOT include explanations, comments, markdown, lists, or code fences
@@ -695,22 +831,6 @@ def chat():
""" """
prompt = PromptTemplate.from_template(RFP_DECISION_TEMPLATE) prompt = PromptTemplate.from_template(RFP_DECISION_TEMPLATE)
def get_context(x):
query = x.get("input") if isinstance(x, dict) else x
# 1. Recupera chunks vetoriais normalmente
docs = retriever.invoke(query)
req = parse_rfp_requirement(query)
query_terms = extract_graph_keywords_from_requirement(req)
graph_context = query_knowledge_graph(query_terms)
graph_terms = extract_terms_from_graph_text(graph_context)
reranked_chunks = rerank_documents_with_graph_terms(docs, query, graph_terms)
return "\n\n".join(reranked_chunks)
def get_context_from_requirement(req: dict): def get_context_from_requirement(req: dict):
query_terms = extract_graph_keywords_from_requirement(req) query_terms = extract_graph_keywords_from_requirement(req)
@@ -757,21 +877,6 @@ def chat():
print(response) print(response)
print("\n" + "=" * 80 + "\n") print("\n" + "=" * 80 + "\n")
def get_context(x):
query = x.get("input") if isinstance(x, dict) else x
docs = retriever.invoke(query)
req = parse_rfp_requirement(query)
query_terms = extract_graph_keywords_from_requirement(req)
graph_context = query_knowledge_graph(query_terms)
graph_terms = extract_terms_from_graph_text(graph_context)
reranked_chunks = rerank_documents_with_graph_terms(docs, query, graph_terms)
return "\n\n".join(reranked_chunks)
def get_context_from_requirement(req: dict): def get_context_from_requirement(req: dict):
query_terms = extract_graph_keywords_from_requirement(req) query_terms = extract_graph_keywords_from_requirement(req)
@@ -823,10 +928,15 @@ Graph evidence:
Decision rules: Decision rules:
- Answer ONLY with YES, NO or PARTIAL - Answer ONLY with YES, NO or PARTIAL
- Do NOT assume anything not explicitly stated
- If value differs, answer PARTIAL - If value differs, answer PARTIAL
- If not found, answer NO - If not found, answer NO
Interpretation rules (MANDATORY):
- If a capability is supported but requires reboot, downtime, or restart, it STILL counts as YES unless the requirement explicitly forbids it.
- "Servidor em funcionamento" means the resource exists and is active before the operation, not that it must remain online without interruption.
- Only answer NO if the operation is NOT supported at all or requires destroying and recreating the resource.
- Reboot, restart, or brief unavailability MUST NOT be interpreted as lack of support.
Confidence rules: Confidence rules:
- HIGH: Explicit evidence directly answers the requirement - HIGH: Explicit evidence directly answers the requirement
- MEDIUM: Evidence partially matches or requires light interpretation - MEDIUM: Evidence partially matches or requires light interpretation
@@ -841,6 +951,10 @@ Ambiguity rules:
Service scope rules (MANDATORY): Service scope rules (MANDATORY):
- Evidence is valid ONLY if it refers to the SAME service category as the requirement. - Evidence is valid ONLY if it refers to the SAME service category as the requirement.
- Do NOT use evidence from a different Oracle Cloud service to justify another. - Do NOT use evidence from a different Oracle Cloud service to justify another.
- PaaS services (e.g. Autonomous Database) MUST NOT be used as evidence for IaaS/Compute requirements.
- If the requirement is under Compute/IaaS, evidence MUST explicitly mention Compute, IaaS, VM, Bare Metal, or equivalent infrastructure services.
- Cross-service inference (vendor-level capability applied to another service) is strictly forbidden.
- Get all URL references or sources as evidences
OUTPUT CONSTRAINTS (MANDATORY): OUTPUT CONSTRAINTS (MANDATORY):
- Return ONLY a valid JSON object - Return ONLY a valid JSON object