This commit is contained in:
2026-01-14 09:44:22 -03:00
parent 1dce8e20d3
commit 51cf883150
2 changed files with 191 additions and 77 deletions

BIN
.DS_Store vendored

Binary file not shown.

View File

@@ -7,7 +7,7 @@ from langchain.schema.runnable import RunnableMap
from langchain_community.document_loaders import UnstructuredPDFLoader, PyMuPDFLoader
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda
from pathlib import Path
from tqdm import tqdm
import os
import pickle
@@ -15,6 +15,7 @@ import re
import atexit
import oracledb
import json
import base64
# =========================
# Oracle Autonomous Configuration
@@ -22,9 +23,8 @@ import json
# Connection settings for the Oracle Autonomous Database.
WALLET_PATH = "Wallet_oradb23ai"
DB_ALIAS = "oradb23ai_high"
USERNAME = "admin"
# SECURITY: the database password must never be committed to source control.
# It is read from the ORACLE_DB_PASSWORD environment variable; the masked
# placeholder is only a fallback so the name is always defined.
# NOTE(review): a real password was previously committed here - rotate it.
PASSWORD = os.environ.get("ORACLE_DB_PASSWORD", "**********")
# Point the Oracle client at the wallet directory.
os.environ["TNS_ADMIN"] = WALLET_PATH
GRAPH_NAME = "GRAPH_DB_1"
# =========================
# Global Configurations
@@ -32,7 +32,7 @@ GRAPH_NAME = "GRAPH_DB_1"
INDEX_PATH = "./faiss_index"
PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl")
chapter_separator_regex = r"^(#{1,6} .+|\*\*.+\*\*)$"
pdf_paths = ['<YOUR_KNOWLEDGE_BASE_FILE>.pdf']
GRAPH_NAME = "OCI_GRAPH"
# =========================
# LLM Definitions
@@ -52,7 +52,6 @@ llm_for_rag = ChatOCIGenAI(
auth_profile="DEFAULT",
)
embeddings = OCIGenAIEmbeddings(
model_id="cohere.embed-multilingual-v3.0",
service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
@@ -70,6 +69,12 @@ oracle_conn = oracledb.connect(
)
atexit.register(lambda: oracle_conn.close())
def filename_to_url(filename: str, suffix: str = ".pdf") -> str:
    """Decode the URL that is stored as a URL-safe Base64 file name.

    Args:
        filename: File name (without directory) whose stem is the URL-safe
            Base64 encoding of a UTF-8 string.
        suffix: Extension to strip before decoding. An empty string means
            "strip nothing".

    Returns:
        The decoded UTF-8 text.

    Raises:
        UnicodeEncodeError: If ``filename`` contains non-ASCII characters.
        binascii.Error: If the stem is not valid Base64.
        UnicodeDecodeError: If the decoded bytes are not valid UTF-8.
    """
    # Guard against an empty suffix: ``endswith("")`` is always true and
    # ``filename[:-0]`` would wipe the whole name.
    if suffix and filename.endswith(suffix):
        filename = filename[: -len(suffix)]
    # File names often drop the trailing "=" padding; restore it so the
    # decoder accepts both padded and unpadded stems.
    padded = filename + "=" * (-len(filename) % 4)
    decoded = base64.urlsafe_b64decode(padded.encode("ascii"))
    return decoded.decode("utf-8")
# =========================
# Oracle Graph Client
# =========================
@@ -81,7 +86,6 @@ def ensure_oracle_text_index(
):
cursor = conn.cursor()
# 1. Verifica se índice existe e status
cursor.execute("""
SELECT status
FROM user_indexes
@@ -92,7 +96,6 @@ def ensure_oracle_text_index(
index_exists = row is not None
index_status = row[0] if row else None
# 2. Se índice não existe → cria e NÃO sincroniza agora
if not index_exists:
print(f"🛠️ Creating Oracle Text index {index_name}")
@@ -107,7 +110,6 @@ def ensure_oracle_text_index(
print(f"✅ Index {index_name} created (sync deferred)")
return
# 3. Se índice existe mas está inválido → drop + recreate
if index_status != "VALID":
print(f"⚠️ Index {index_name} is {index_status}. Recreating...")
@@ -129,7 +131,6 @@ def ensure_oracle_text_index(
print(f"♻️ Index {index_name} recreated (sync deferred)")
return
# 4. Índice existe e está VALID → sincroniza com proteção
print(f"🔄 Syncing Oracle Text index: {index_name}")
try:
cursor.execute(f"""
@@ -191,19 +192,21 @@ def create_tables_if_not_exist(conn):
create_tables_if_not_exist(oracle_conn)
ensure_oracle_text_index(
oracle_conn,
"ENTITIES_" + GRAPH_NAME,
"NAME",
"IDX_ENT_" + GRAPH_NAME + "_NAME"
)
ensure_oracle_text_index(
oracle_conn,
"RELATIONS_" + GRAPH_NAME,
"RELATION_TYPE",
"IDX_REL_" + GRAPH_NAME + "_RELTYPE"
)
# If the graph's Oracle Text indexes are invalid or out of date, uncomment the calls below to rebuild them.
# ensure_oracle_text_index(
# oracle_conn,
# "ENTITIES_" + GRAPH_NAME,
# "NAME",
# "IDX_ENT_" + GRAPH_NAME + "_NAME"
# )
#
# ensure_oracle_text_index(
# oracle_conn,
# "RELATIONS_" + GRAPH_NAME,
# "RELATION_TYPE",
# "IDX_REL_" + GRAPH_NAME + "_RELTYPE"
# )
def create_knowledge_graph(chunks):
cursor = oracle_conn.cursor()
@@ -317,24 +320,145 @@ def create_knowledge_graph(chunks):
def parse_rfp_requirement(question: str) -> dict:
prompt = f"""
You are an RFP requirement extractor.
Return the result STRICTLY between the tags <json> and </json>.
Do NOT write anything outside these tags.
Question:
{question}
<json>
{{
"requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL",
"subject": "<short subject>",
"expected_value": "<value or condition if any>",
"decision_type": "YES_NO | YES_NO_PARTIAL",
"keywords": ["keyword1", "keyword2"]
}}
</json>
"""
You are an RFP requirement NORMALIZER for Oracle Cloud Infrastructure (OCI).
Your job is NOT to summarize the question.
Your job is to STRUCTURE the requirement so it can be searched in:
- Technical documentation
- Knowledge Graph
- Vector databases
────────────────────────────────
STEP 1 — Understand the requirement
────────────────────────────────
From the question, identify:
1. The PRIMARY OCI SERVICE CATEGORY involved
2. The MAIN TECHNICAL SUBJECT (short and precise)
3. The EXPECTED TECHNICAL CAPABILITY or CONDITION (if any)
IMPORTANT:
- Ignore marketing language
- Ignore phrases like "possui", "permite", "oferece"
- Focus ONLY on concrete technical meaning
────────────────────────────────
STEP 2 — Mandatory service classification
────────────────────────────────
You MUST choose ONE primary technology from the list below
and INCLUDE IT EXPLICITLY in the keywords list.
Choose the MOST SPECIFIC applicable item.
☁️ OCI SERVICE CATEGORIES (MANDATORY)
🖥️ Compute (IaaS)
- compute
- compute instances
- virtual machine
- bare metal
- gpu
- hpc
- confidential computing
- autoscaling
- instance pools
- live migration
- ocvs (vmware)
- arm compute
💾 Storage
- object storage
- archive storage
- block volume
- boot volume
- file storage
- volume groups
- snapshots
- replication
🌐 Networking
- vcn
- load balancer
- network load balancer
- dns
- fastconnect
- drg
- firewall
- waf
- bastion
- vtap
- private endpoint
🔐 Security & Identity
- iam
- compartments
- policies
- oci vault
- key management
- certificates
- secrets
- cloud guard
- security zones
- vulnerability scanning
- data safe
- audit
- logging
- shielded instances
📦 Containers & Cloud Native
- oke
- kubernetes
- container registry
- api gateway
- functions
- streaming
- events
- service mesh
🗄️ Databases
- autonomous database
- adw
- atp
- base database
- exadata
- mysql
- nosql
📊 Analytics & AI
- analytics cloud
- data science
- data catalog
- big data service
- generative ai
- ai services
────────────────────────────────
STEP 3 — Keywords rules (CRITICAL)
────────────────────────────────
The "keywords" field MUST:
- ALWAYS include at least ONE OCI service keyword (e.g. "compute", "object storage", "oke")
- Include technical capability terms (e.g. resize, autoscaling, encryption)
- NEVER include generic verbs (permitir, possuir, oferecer)
- NEVER include full sentences
────────────────────────────────
STEP 4 — Output rules
────────────────────────────────
Return ONLY valid JSON between <json> tags.
Do NOT explain your reasoning.
Question:
{question}
<json>
{{
"requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL",
"subject": "<short technical subject, e.g. 'Compute Instances'>",
"expected_value": "<technical capability or condition, or empty string>",
"decision_type": "YES_NO | YES_NO_PARTIAL",
"keywords": ["mandatory_oci_service", "technical_capability", "additional_term"]
}}
</json>
"""
resp = llm_for_rag.invoke(prompt)
raw = resp.content.strip()
@@ -498,7 +622,8 @@ def semantic_chunking(text):
2. Separate paragraphs by heading
3. Indicate columns with [COLUMN 1], [COLUMN 2] if present
4. Indicate tables with [TABLE] in markdown format
5. Indicate explicity metrics (if it exists)
5. ALWAYS PUT THE URL if there is a Reference
6. Indicate explicity metrics (if it exists)
Examples:
- Oracle Financial Services RTO is 1 hour
- The Oracle Banking Supply Chain Finance Cloud Service A maximum number of 10K Hosted Transactions
@@ -515,7 +640,6 @@ def semantic_chunking(text):
return response
def read_pdfs(pdf_path):
if "-ocr" in pdf_path:
doc_pages = PyMuPDFLoader(str(pdf_path)).load()
@@ -568,7 +692,11 @@ def save_indexed_docs(docs):
# Main Function
# =========================
def chat():
pdf_paths = ['RFP - Financial v2.pdf']
PDF_FOLDER = Path("docs") # pasta onde estão os PDFs
pdf_paths = sorted(
str(p) for p in PDF_FOLDER.glob("*.pdf")
)
already_indexed_docs = load_previously_indexed_docs()
updated_docs = set()
@@ -588,6 +716,7 @@ def chat():
print(f"✅ Document already indexed: {pdf_path}")
continue
full_text = read_pdfs(pdf_path=pdf_path)
path_url = filename_to_url(os.path.basename(pdf_path))
text_chunks = smart_split_text(full_text, max_chunk_size=10_000)
overflow_buffer = ""
@@ -610,7 +739,9 @@ def chat():
overflow_buffer = ""
for chapter_text in chapters:
doc = Document(page_content=chapter_text, metadata={"source": pdf_path})
reference_url = "Reference: " + path_url
chapter_text = chapter_text + "\n" + reference_url
doc = Document(page_content=chapter_text, metadata={"source": pdf_path, "reference": reference_url})
new_chunks.append(doc)
print(f"✅ New chapter indexed:\n{chapter_text}...\n")
@@ -653,9 +784,14 @@ def chat():
Decision rules:
- Answer ONLY with YES, NO or PARTIAL
- Do NOT assume anything not explicitly stated
- If value differs, answer PARTIAL
- If not found, answer NO
Interpretation rules (MANDATORY):
- If a capability is supported but requires reboot, downtime, or restart, it STILL counts as YES unless the requirement explicitly forbids it.
- "Servidor em funcionamento" means the resource exists and is active before the operation, not that it must remain online without interruption.
- Only answer NO if the operation is NOT supported at all or requires destroying and recreating the resource.
- Reboot, restart, or brief unavailability MUST NOT be interpreted as lack of support.
Confidence rules:
- HIGH: Explicit evidence directly answers the requirement
@@ -671,7 +807,7 @@ def chat():
Service scope rules (MANDATORY):
- Evidence is valid ONLY if it refers to the SAME service category as the requirement.
- Do NOT use evidence from a different Oracle Cloud service to justify another.
OUTPUT CONSTRAINTS (MANDATORY):
- Return ONLY a valid JSON object
- Do NOT include explanations, comments, markdown, lists, or code fences
@@ -695,22 +831,6 @@ def chat():
"""
prompt = PromptTemplate.from_template(RFP_DECISION_TEMPLATE)
def get_context(x):
query = x.get("input") if isinstance(x, dict) else x
# 1. Recupera chunks vetoriais normalmente
docs = retriever.invoke(query)
req = parse_rfp_requirement(query)
query_terms = extract_graph_keywords_from_requirement(req)
graph_context = query_knowledge_graph(query_terms)
graph_terms = extract_terms_from_graph_text(graph_context)
reranked_chunks = rerank_documents_with_graph_terms(docs, query, graph_terms)
return "\n\n".join(reranked_chunks)
def get_context_from_requirement(req: dict):
query_terms = extract_graph_keywords_from_requirement(req)
@@ -757,21 +877,6 @@ def chat():
print(response)
print("\n" + "=" * 80 + "\n")
def get_context(x):
query = x.get("input") if isinstance(x, dict) else x
docs = retriever.invoke(query)
req = parse_rfp_requirement(query)
query_terms = extract_graph_keywords_from_requirement(req)
graph_context = query_knowledge_graph(query_terms)
graph_terms = extract_terms_from_graph_text(graph_context)
reranked_chunks = rerank_documents_with_graph_terms(docs, query, graph_terms)
return "\n\n".join(reranked_chunks)
def get_context_from_requirement(req: dict):
query_terms = extract_graph_keywords_from_requirement(req)
@@ -823,10 +928,15 @@ Graph evidence:
Decision rules:
- Answer ONLY with YES, NO or PARTIAL
- Do NOT assume anything not explicitly stated
- If value differs, answer PARTIAL
- If not found, answer NO
Interpretation rules (MANDATORY):
- If a capability is supported but requires reboot, downtime, or restart, it STILL counts as YES unless the requirement explicitly forbids it.
- "Servidor em funcionamento" means the resource exists and is active before the operation, not that it must remain online without interruption.
- Only answer NO if the operation is NOT supported at all or requires destroying and recreating the resource.
- Reboot, restart, or brief unavailability MUST NOT be interpreted as lack of support.
Confidence rules:
- HIGH: Explicit evidence directly answers the requirement
- MEDIUM: Evidence partially matches or requires light interpretation
@@ -841,6 +951,10 @@ Ambiguity rules:
Service scope rules (MANDATORY):
- Evidence is valid ONLY if it refers to the SAME service category as the requirement.
- Do NOT use evidence from a different Oracle Cloud service to justify another.
- PaaS services (e.g. Autonomous Database) MUST NOT be used as evidence for IaaS/Compute requirements.
- If the requirement is under Compute/IaaS, evidence MUST explicitly mention Compute, IaaS, VM, Bare Metal, or equivalent infrastructure services.
- Cross-service inference (vendor-level capability applied to another service) is strictly forbidden.
- Get all URL references or sources as evidences
OUTPUT CONSTRAINTS (MANDATORY):
- Return ONLY a valid JSON object