diff --git a/.DS_Store b/.DS_Store index 154b1de..d9a783d 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/files/graphrag_rerank.py b/files/graphrag_rerank.py index 9a161fc..2680a36 100644 --- a/files/graphrag_rerank.py +++ b/files/graphrag_rerank.py @@ -7,7 +7,7 @@ from langchain.schema.runnable import RunnableMap from langchain_community.document_loaders import UnstructuredPDFLoader, PyMuPDFLoader from langchain_core.documents import Document from langchain_core.runnables import RunnableLambda - +from pathlib import Path from tqdm import tqdm import os import pickle @@ -15,6 +15,7 @@ import re import atexit import oracledb import json +import base64 # ========================= # Oracle Autonomous Configuration @@ -22,9 +23,8 @@ import json WALLET_PATH = "Wallet_oradb23ai" DB_ALIAS = "oradb23ai_high" USERNAME = "admin" -PASSWORD = "**********" +PASSWORD = "Moniquinha1972" os.environ["TNS_ADMIN"] = WALLET_PATH -GRAPH_NAME = "GRAPH_DB_1" # ========================= # Global Configurations @@ -32,7 +32,7 @@ GRAPH_NAME = "GRAPH_DB_1" INDEX_PATH = "./faiss_index" PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl") chapter_separator_regex = r"^(#{1,6} .+|\*\*.+\*\*)$" -pdf_paths = ['.pdf'] +GRAPH_NAME = "OCI_GRAPH" # ========================= # LLM Definitions @@ -52,7 +52,6 @@ llm_for_rag = ChatOCIGenAI( auth_profile="DEFAULT", ) - embeddings = OCIGenAIEmbeddings( model_id="cohere.embed-multilingual-v3.0", service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com", @@ -70,6 +69,12 @@ oracle_conn = oracledb.connect( ) atexit.register(lambda: oracle_conn.close()) +def filename_to_url(filename: str, suffix: str = ".pdf") -> str: + if filename.endswith(suffix): + filename = filename[: -len(suffix)] + decoded = base64.urlsafe_b64decode(filename.encode("ascii")) + return decoded.decode("utf-8") + # ========================= # Oracle Graph Client # ========================= @@ -81,7 +86,6 @@ def ensure_oracle_text_index( ): cursor = conn.cursor() - # 1. Verifica se índice existe e status cursor.execute(""" SELECT status FROM user_indexes @@ -92,7 +96,6 @@ def ensure_oracle_text_index( index_exists = row is not None index_status = row[0] if row else None - # 2. Se índice não existe → cria e NÃO sincroniza agora if not index_exists: print(f"🛠️ Creating Oracle Text index {index_name}") @@ -107,7 +110,6 @@ def ensure_oracle_text_index( print(f"✅ Index {index_name} created (sync deferred)") return - # 3. Se índice existe mas está inválido → drop + recreate if index_status != "VALID": print(f"⚠️ Index {index_name} is {index_status}. Recreating...") @@ -129,7 +131,6 @@ def ensure_oracle_text_index( print(f"♻️ Index {index_name} recreated (sync deferred)") return - # 4. Índice existe e está VALID → sincroniza com proteção print(f"🔄 Syncing Oracle Text index: {index_name}") try: cursor.execute(f""" @@ -191,19 +192,21 @@ def create_tables_if_not_exist(conn): create_tables_if_not_exist(oracle_conn) -ensure_oracle_text_index( - oracle_conn, - "ENTITIES_" + GRAPH_NAME, - "NAME", - "IDX_ENT_" + GRAPH_NAME + "_NAME" -) -ensure_oracle_text_index( - oracle_conn, - "RELATIONS_" + GRAPH_NAME, - "RELATION_TYPE", - "IDX_REL_" + GRAPH_NAME + "_RELTYPE" -) +# IF GRAPH INDEX PROBLEM, Reindex +# ensure_oracle_text_index( +# oracle_conn, +# "ENTITIES_" + GRAPH_NAME, +# "NAME", +# "IDX_ENT_" + GRAPH_NAME + "_NAME" +# ) +# +# ensure_oracle_text_index( +# oracle_conn, +# "RELATIONS_" + GRAPH_NAME, +# "RELATION_TYPE", +# "IDX_REL_" + GRAPH_NAME + "_RELTYPE" +# ) def create_knowledge_graph(chunks): cursor = oracle_conn.cursor() @@ -317,24 +320,145 @@ def create_knowledge_graph(chunks): def parse_rfp_requirement(question: str) -> dict: prompt = f""" -You are an RFP requirement extractor. - -Return the result STRICTLY between the tags and . -Do NOT write anything outside these tags. - -Question: -{question} - - -{{ - "requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL", - "subject": "", - "expected_value": "", - "decision_type": "YES_NO | YES_NO_PARTIAL", - "keywords": ["keyword1", "keyword2"] -}} - -""" + You are an RFP requirement NORMALIZER for Oracle Cloud Infrastructure (OCI). + + Your job is NOT to summarize the question. + Your job is to STRUCTURE the requirement so it can be searched in: + - Technical documentation + - Knowledge Graph + - Vector databases + + ──────────────────────────────── + STEP 1 — Understand the requirement + ──────────────────────────────── + From the question, identify: + 1. The PRIMARY OCI SERVICE CATEGORY involved + 2. The MAIN TECHNICAL SUBJECT (short and precise) + 3. The EXPECTED TECHNICAL CAPABILITY or CONDITION (if any) + + IMPORTANT: + - Ignore marketing language + - Ignore phrases like "possui", "permite", "oferece" + - Focus ONLY on concrete technical meaning + + ──────────────────────────────── + STEP 2 — Mandatory service classification + ──────────────────────────────── + You MUST choose ONE primary technology from the list below + and INCLUDE IT EXPLICITLY in the keywords list. + + Choose the MOST SPECIFIC applicable item. + + ☁️ OCI SERVICE CATEGORIES (MANDATORY) + + 🖥️ Compute (IaaS) + - compute + - compute instances + - virtual machine + - bare metal + - gpu + - hpc + - confidential computing + - autoscaling + - instance pools + - live migration + - ocvs (vmware) + - arm compute + + 💾 Storage + - object storage + - archive storage + - block volume + - boot volume + - file storage + - volume groups + - snapshots + - replication + + 🌐 Networking + - vcn + - load balancer + - network load balancer + - dns + - fastconnect + - drg + - firewall + - waf + - bastion + - vtap + - private endpoint + + 🔐 Security & Identity + - iam + - compartments + - policies + - oci vault + - key management + - certificates + - secrets + - cloud guard + - security zones + - vulnerability scanning + - data safe + - audit + - logging + - shielded instances + + 📦 Containers & Cloud Native + - oke + - kubernetes + - container registry + - api gateway + - functions + - streaming + - events + - service mesh + + 🗄️ Databases + - autonomous database + - adw + - atp + - base database + - exadata + - mysql + - nosql + + 📊 Analytics & AI + - analytics cloud + - data science + - data catalog + - big data service + - generative ai + - ai services + + ──────────────────────────────── + STEP 3 — Keywords rules (CRITICAL) + ──────────────────────────────── + The "keywords" field MUST: + - ALWAYS include at least ONE OCI service keyword (e.g. "compute", "object storage", "oke") + - Include technical capability terms (e.g. resize, autoscaling, encryption) + - NEVER include generic verbs (permitir, possuir, oferecer) + - NEVER include full sentences + + ──────────────────────────────── + STEP 4 — Output rules + ──────────────────────────────── + Return ONLY valid JSON between tags. + Do NOT explain your reasoning. + + Question: + {question} + + + {{ + "requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL", + "subject": "", + "expected_value": "", + "decision_type": "YES_NO | YES_NO_PARTIAL", + "keywords": ["mandatory_oci_service", "technical_capability", "additional_term"] + }} + + """ resp = llm_for_rag.invoke(prompt) raw = resp.content.strip() @@ -498,7 +622,8 @@ def semantic_chunking(text): 2. Separate paragraphs by heading 3. Indicate columns with [COLUMN 1], [COLUMN 2] if present 4. Indicate tables with [TABLE] in markdown format - 5. Indicate explicity metrics (if it exists) + 5. ALWAYS PUT THE URL if there is a Reference + 6. Indicate explicity metrics (if it exists) Examples: - Oracle Financial Services RTO is 1 hour - The Oracle Banking Supply Chain Finance Cloud Service A maximum number of 10K Hosted Transactions @@ -515,7 +640,6 @@ def semantic_chunking(text): return response - def read_pdfs(pdf_path): if "-ocr" in pdf_path: doc_pages = PyMuPDFLoader(str(pdf_path)).load() @@ -568,7 +692,11 @@ def save_indexed_docs(docs): # Main Function # ========================= def chat(): - pdf_paths = ['RFP - Financial v2.pdf'] + PDF_FOLDER = Path("docs") # pasta onde estão os PDFs + + pdf_paths = sorted( + str(p) for p in PDF_FOLDER.glob("*.pdf") + ) already_indexed_docs = load_previously_indexed_docs() updated_docs = set() @@ -588,6 +716,7 @@ def chat(): print(f"✅ Document already indexed: {pdf_path}") continue full_text = read_pdfs(pdf_path=pdf_path) + path_url = filename_to_url(os.path.basename(pdf_path)) text_chunks = smart_split_text(full_text, max_chunk_size=10_000) overflow_buffer = "" @@ -610,7 +739,9 @@ def chat(): overflow_buffer = "" for chapter_text in chapters: - doc = Document(page_content=chapter_text, metadata={"source": pdf_path}) + reference_url = "Reference: " + path_url + chapter_text = chapter_text + "\n" + reference_url + doc = Document(page_content=chapter_text, metadata={"source": pdf_path, "reference": reference_url}) new_chunks.append(doc) print(f"✅ New chapter indexed:\n{chapter_text}...\n") @@ -653,9 +784,14 @@ def chat(): Decision rules: - Answer ONLY with YES, NO or PARTIAL - - Do NOT assume anything not explicitly stated - If value differs, answer PARTIAL - If not found, answer NO + + Interpretation rules (MANDATORY): + - If a capability is supported but requires reboot, downtime, or restart, it STILL counts as YES unless the requirement explicitly forbids it. + - "Servidor em funcionamento" means the resource exists and is active before the operation, not that it must remain online without interruption. + - Only answer NO if the operation is NOT supported at all or requires destroying and recreating the resource. + - Reboot, restart, or brief unavailability MUST NOT be interpreted as lack of support. Confidence rules: - HIGH: Explicit evidence directly answers the requirement @@ -671,7 +807,7 @@ def chat(): Service scope rules (MANDATORY): - Evidence is valid ONLY if it refers to the SAME service category as the requirement. - Do NOT use evidence from a different Oracle Cloud service to justify another. - + OUTPUT CONSTRAINTS (MANDATORY): - Return ONLY a valid JSON object - Do NOT include explanations, comments, markdown, lists, or code fences @@ -695,22 +831,6 @@ def chat(): """ prompt = PromptTemplate.from_template(RFP_DECISION_TEMPLATE) - def get_context(x): - query = x.get("input") if isinstance(x, dict) else x - - # 1. Recupera chunks vetoriais normalmente - docs = retriever.invoke(query) - - req = parse_rfp_requirement(query) - query_terms = extract_graph_keywords_from_requirement(req) - graph_context = query_knowledge_graph(query_terms) - - graph_terms = extract_terms_from_graph_text(graph_context) - - reranked_chunks = rerank_documents_with_graph_terms(docs, query, graph_terms) - - return "\n\n".join(reranked_chunks) - def get_context_from_requirement(req: dict): query_terms = extract_graph_keywords_from_requirement(req) @@ -757,21 +877,6 @@ def chat(): print(response) print("\n" + "=" * 80 + "\n") -def get_context(x): - query = x.get("input") if isinstance(x, dict) else x - - docs = retriever.invoke(query) - - req = parse_rfp_requirement(query) - query_terms = extract_graph_keywords_from_requirement(req) - graph_context = query_knowledge_graph(query_terms) - - graph_terms = extract_terms_from_graph_text(graph_context) - - reranked_chunks = rerank_documents_with_graph_terms(docs, query, graph_terms) - - return "\n\n".join(reranked_chunks) - def get_context_from_requirement(req: dict): query_terms = extract_graph_keywords_from_requirement(req) @@ -823,10 +928,15 @@ Graph evidence: Decision rules: - Answer ONLY with YES, NO or PARTIAL -- Do NOT assume anything not explicitly stated - If value differs, answer PARTIAL - If not found, answer NO +Interpretation rules (MANDATORY): +- If a capability is supported but requires reboot, downtime, or restart, it STILL counts as YES unless the requirement explicitly forbids it. +- "Servidor em funcionamento" means the resource exists and is active before the operation, not that it must remain online without interruption. +- Only answer NO if the operation is NOT supported at all or requires destroying and recreating the resource. +- Reboot, restart, or brief unavailability MUST NOT be interpreted as lack of support. + Confidence rules: - HIGH: Explicit evidence directly answers the requirement - MEDIUM: Evidence partially matches or requires light interpretation @@ -841,6 +951,10 @@ Ambiguity rules: Service scope rules (MANDATORY): - Evidence is valid ONLY if it refers to the SAME service category as the requirement. - Do NOT use evidence from a different Oracle Cloud service to justify another. +- PaaS services (e.g. Autonomous Database) MUST NOT be used as evidence for IaaS/Compute requirements. +- If the requirement is under Compute/IaaS, evidence MUST explicitly mention Compute, IaaS, VM, Bare Metal, or equivalent infrastructure services. +- Cross-service inference (vendor-level capability applied to another service) is strictly forbidden. +- Get all URL references or sources as evidences OUTPUT CONSTRAINTS (MANDATORY): - Return ONLY a valid JSON object