From 51cf883150699cab773738d1d9cb1eee5c86927a Mon Sep 17 00:00:00 2001
From: Cristiano Hoshikawa <hoshikawa@uol.com.br>
Date: Wed, 14 Jan 2026 09:44:22 -0300
Subject: [PATCH] Index PDFs from docs/ with source-URL references and refine RFP prompts

---
 .DS_Store                | Bin 6148 -> 10244 bytes
 files/graphrag_rerank.py | 268 ++++++++++++++++++++++++++++-----------
 2 files changed, 191 insertions(+), 77 deletions(-)
diff --git a/.DS_Store b/.DS_Store
index 154b1dea0b56a0bebf521d7761b0544679974fda..d9a783d85bc43d45877424a41066d6984c7beb97 100644
GIT binary patch
literal 10244
zcmeHMYitx%6uxKL!i;6?6pFBY$doEpE%pJlJPdAMfl?4;`vxhqJ3~8RI#YLMcPq6?
z4L&d!VvG;eAEN$6e8t39B7zYkAtGsrN+KFGni%|{sEI#}=iWP8b_-R18O6ED+%xw)
z?%DIrId|sXC4@jzLa!x6C4`80QL4<s?lO(a>2*mHd`=NkKzm{h)J9^?(C{Ui0V6;l
zKp;RMKp;RMK;Ykk0KT(n6Z1I@`T&6dfdGL?1nByp;zeoLr{kQmr2{**1wfjMY}W8z
zp*KiXpN4%p&M8W;0XId;O~Kz{05?az>g$DlI?gFKC*W^B!1oOP4h5KZoL^<?1j3vK
zeSkoKz@-S#y?ZXXg$xjrcv<`R(uOxOWkXI&OVo0r$P;&ueOI=B@3F>gx&Ei};lYfv
zf%21RO7hUJNpxZmEz2{hK9McW^nGOcV7c!*!+yxQ=)LCXYo)*LVZSazBg;QIK>gFW
z9TJ=BJ!eg(EYo*;dC|!m`OK5HnVv5v&-Tk!r#!L)<F+tvH=1&XeN^9!4qK+{)zq8@
zAtyI)#!PJaGw+NHI;n`Kc?q{ki*KQ$4#%;g>3c2OWg4k|rKs7lJ<YU@I5RMH168(m
zo3`$xnw*5KyUaGi1roM?B{Vu(S65r5*3~T?t5Qd6YnRtosSQh)jEza++=}W|oBMZK
zL$<S*VudTJzw#29k0<+w@h&5z>_<6%Zi>`RHuHdUdeQ-(@dFm6tMX^bSIfP9y<4`*
z+j`8HV@J$gM!!<vV%%*w+!QyG7;ksXu@XbJ+piSHG%J>{G|y<XEUN7GX(JuRh?h2u
z#U0Dq<hZ71I(Bc5H=U@b&voKnGn=<xQ4+4P*}!Sdq|t(tqo>)_@y1~~joW3!U8EJ^
z8jgZ=s9=8C!bLaNEL(Y7%i2vH6H4LiIYo12xexKSJ#&X?#5+@_?hW>8Zp^TC(;nzf
znywkOj1F5HG74d>Bu&gIoUMe4L#3ipA@(Gq=3NQXyPI+41)*|Dd0pJ9<B!?k=yXP%
z>n{jhFN#|zzd3GT&)ITi=q5?g#10oR;ZrP@t0l#iI(A}GaML<@si^2ur=>*=i?bT!
z6{6BEbtid*oO!FfS}cSrsRs}DhA<wNvl~OrqVl1*jrv}%x!%TM!@l^d)`(&+l{D%)
zGL^P<?dvy$Lertp=MHLdV`B;vSUDw%O#cqlXU`rbbpBS|^qjaa>|{d4w38-Q2R;m5
zB%&x)ge&<|so;c=io?VFvw&eODfDAw0jVS_NfYTJ8ga=zWFL8o93=0NkH~TIIXOpu
zAQ#B*Faz@8YA65&LQo8)PzH;j8fu^pmOvQVU>&qWH*5zDVxWTwad064Nf?Fu;C^@z
z_P{=P686K>@C>{F2jN9{8D4?6;3&KgAHXp<4QJs?_zJ#<3-FteE6ft+2_d0aC=sp`
z>V*bjxzH$d2%SQg&@F5iOd&3$gb^N=kgmu6beKMW7t+jnbd&xPBfK<S#kH+#YhN$d
z|CcK6vjSdY*UXz=P780v>gLJx<O!Y5;mq1#<&afGtQ=TgOwHp^aY<>}wX)iY^<s=n
zm$=GIS{dR5)m~U3t4bbL6d&GDiNhIKO?+6jSXC7Pi<l2<>r`3E!4j32ZT(^#f<IWf
zN|u#eELLfvw74WBhvg=TE)fc0x-G3Z6D_^Jk?enw>R*s=$$9cK0H*pJn2!lw1<PO+
zG-6^$pc8ta7y2-{4H$qymf#+ycM3+}Uf7K({s1QVBk(9Z29LuN@GLwB&$C27#1j2g
zcpKh<ci}J`f#YxjK8BO<37mp6@HL!=AEzYp?b(S8KV>I!=9~EJiq?|JuYk$7Y394Q
z`<qWU|MHOk6)>rh{Wd;89cGkDG7`5f#~wg#y11xD&;kSk1OfyC1Of!EBm{DJFe*L&
zZ=L@C|0^lvAeI1u0D&n8KyFK<r3snSee}|^b``Jfc+p1n#yMps*s<eyr0h5z2lP0e
qZNri0ACfS;F!HJWIH!1r?VtZMAlU!2BgXWJTd@DfGr0fP{{IhN=PE$}

delta 319
zcmZn(XfcprU|?W$DortDU=RQ@Ie-{MGjUE#6q~50$SANeU^gS9z+@f)$$Dw)(?BT}
z20ey!hD?T%+<X_8q@4UDkhX>ESFZooaoiDAJ_WCQL55*)a(-?BP!9ux+yfBF3N$;D
zA%!88Au%T%NtQu<5(5K2vNX{0$yWvBCtniSF*#O<PpQ-vtUV2=JqM`07};?me_k*C
z)-m~ngbW*$`AR~Lal*z#9+t)I92|noAgh7?<^~e3Air-+{LVa?U&Rp=I1G#+F9QJ+
OBy_-Ho8x)rFarRHs!0t1

diff --git a/files/graphrag_rerank.py b/files/graphrag_rerank.py
index 9a161fc..2680a36 100644
--- a/files/graphrag_rerank.py
+++ b/files/graphrag_rerank.py
@@ -7,7 +7,7 @@ from langchain.schema.runnable import RunnableMap
 from langchain_community.document_loaders import UnstructuredPDFLoader, PyMuPDFLoader
 from langchain_core.documents import Document
 from langchain_core.runnables import RunnableLambda
-
+from pathlib import Path
 from tqdm import tqdm
 import os
 import pickle
@@ -15,6 +15,7 @@ import re
 import atexit
 import oracledb
 import json
+import base64
 
 # =========================
 # Oracle Autonomous Configuration
@@ -22,9 +23,8 @@ import json
 WALLET_PATH = "Wallet_oradb23ai"
 DB_ALIAS = "oradb23ai_high"
 USERNAME = "admin"
-PASSWORD = "**********"
+PASSWORD = os.environ["ORACLE_DB_PASSWORD"]  # read from the environment; never commit credentials (fails fast with KeyError if unset)
 os.environ["TNS_ADMIN"] = WALLET_PATH
-GRAPH_NAME = "GRAPH_DB_1"
 
 # =========================
 # Global Configurations
@@ -32,7 +32,7 @@ GRAPH_NAME = "GRAPH_DB_1"
 INDEX_PATH = "./faiss_index"
 PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl")
 chapter_separator_regex = r"^(#{1,6} .+|\*\*.+\*\*)$"
-pdf_paths = ['<YOUR_KNOWLEDGE_BASE_FILE>.pdf']
+GRAPH_NAME = "OCI_GRAPH"
 
 # =========================
 # LLM Definitions
@@ -52,7 +52,6 @@ llm_for_rag = ChatOCIGenAI(
     auth_profile="DEFAULT",
 )
 
-
 embeddings = OCIGenAIEmbeddings(
     model_id="cohere.embed-multilingual-v3.0",
     service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
@@ -70,6 +69,12 @@ oracle_conn = oracledb.connect(
 )
 atexit.register(lambda: oracle_conn.close())
 
+def filename_to_url(filename: str, suffix: str = ".pdf") -> str:
+    """Decode a URL-safe base64 file name (minus *suffix*) into its URL; raises ValueError if malformed."""
+    stem = filename[: -len(suffix)] if suffix and filename.endswith(suffix) else filename
+    stem += "=" * (-len(stem) % 4)  # restore '=' padding, which base64 file names commonly omit
+    return base64.urlsafe_b64decode(stem.encode("ascii")).decode("utf-8")
+
 # =========================
 # Oracle Graph Client
 # =========================
@@ -81,7 +86,6 @@ def ensure_oracle_text_index(
 ):
     cursor = conn.cursor()
 
-    # 1. Verifica se índice existe e status
     cursor.execute("""
                    SELECT status
                    FROM user_indexes
@@ -92,7 +96,6 @@ def ensure_oracle_text_index(
     index_exists = row is not None
     index_status = row[0] if row else None
 
-    # 2. Se índice não existe → cria e NÃO sincroniza agora
     if not index_exists:
         print(f"🛠️ Creating Oracle Text index {index_name}")
 
@@ -107,7 +110,6 @@ def ensure_oracle_text_index(
         print(f"✅ Index {index_name} created (sync deferred)")
         return
 
-    # 3. Se índice existe mas está inválido → drop + recreate
     if index_status != "VALID":
         print(f"⚠️ Index {index_name} is {index_status}. Recreating...")
 
@@ -129,7 +131,6 @@ def ensure_oracle_text_index(
         print(f"♻️ Index {index_name} recreated (sync deferred)")
         return
 
-    # 4. Índice existe e está VALID → sincroniza com proteção
     print(f"🔄 Syncing Oracle Text index: {index_name}")
     try:
         cursor.execute(f"""
@@ -191,19 +192,21 @@ def create_tables_if_not_exist(conn):
 
 
 create_tables_if_not_exist(oracle_conn)
-ensure_oracle_text_index(
-    oracle_conn,
-    "ENTITIES_" + GRAPH_NAME,
-    "NAME",
-    "IDX_ENT_" + GRAPH_NAME + "_NAME"
-)
 
-ensure_oracle_text_index(
-    oracle_conn,
-    "RELATIONS_" + GRAPH_NAME,
-    "RELATION_TYPE",
-    "IDX_REL_" + GRAPH_NAME + "_RELTYPE"
-)
+# If the Oracle Text indexes on the graph tables are missing, invalid or stale, uncomment the calls below to rebuild/sync them.
+# ensure_oracle_text_index(
+#     oracle_conn,
+#     "ENTITIES_" + GRAPH_NAME,
+#     "NAME",
+#     "IDX_ENT_" + GRAPH_NAME + "_NAME"
+# )
+#
+# ensure_oracle_text_index(
+#     oracle_conn,
+#     "RELATIONS_" + GRAPH_NAME,
+#     "RELATION_TYPE",
+#     "IDX_REL_" + GRAPH_NAME + "_RELTYPE"
+# )
 
 def create_knowledge_graph(chunks):
     cursor = oracle_conn.cursor()
@@ -317,24 +320,145 @@ def create_knowledge_graph(chunks):
 
 def parse_rfp_requirement(question: str) -> dict:
     prompt = f"""
-You are an RFP requirement extractor.
-
-Return the result STRICTLY between the tags <json> and </json>.
-Do NOT write anything outside these tags.
-
-Question:
-{question}
-
-<json>
-{{
-  "requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL",
-  "subject": "<short subject>",
-  "expected_value": "<value or condition if any>",
-  "decision_type": "YES_NO | YES_NO_PARTIAL",
-  "keywords": ["keyword1", "keyword2"]
-}}
-</json>
-"""
+        You are an RFP requirement NORMALIZER for Oracle Cloud Infrastructure (OCI).
+        
+        Your job is NOT to summarize the question.
+        Your job is to STRUCTURE the requirement so it can be searched in:
+        - Technical documentation
+        - Knowledge Graph
+        - Vector databases
+        
+        ────────────────────────────────
+        STEP 1 — Understand the requirement
+        ────────────────────────────────
+        From the question, identify:
+        1. The PRIMARY OCI SERVICE CATEGORY involved
+        2. The MAIN TECHNICAL SUBJECT (short and precise)
+        3. The EXPECTED TECHNICAL CAPABILITY or CONDITION (if any)
+        
+        IMPORTANT:
+        - Ignore marketing language
+        - Ignore phrases like "possui", "permite", "oferece"
+        - Focus ONLY on concrete technical meaning
+        
+        ────────────────────────────────
+        STEP 2 — Mandatory service classification
+        ────────────────────────────────
+        You MUST choose ONE primary technology from the list below
+        and INCLUDE IT EXPLICITLY in the keywords list.
+        
+        Choose the MOST SPECIFIC applicable item.
+        
+        ☁️ OCI SERVICE CATEGORIES (MANDATORY)
+        
+        🖥️ Compute (IaaS)
+        - compute
+        - compute instances
+        - virtual machine
+        - bare metal
+        - gpu
+        - hpc
+        - confidential computing
+        - autoscaling
+        - instance pools
+        - live migration
+        - ocvs (vmware)
+        - arm compute
+        
+        💾 Storage
+        - object storage
+        - archive storage
+        - block volume
+        - boot volume
+        - file storage
+        - volume groups
+        - snapshots
+        - replication
+        
+        🌐 Networking
+        - vcn
+        - load balancer
+        - network load balancer
+        - dns
+        - fastconnect
+        - drg
+        - firewall
+        - waf
+        - bastion
+        - vtap
+        - private endpoint
+        
+        🔐 Security & Identity
+        - iam
+        - compartments
+        - policies
+        - oci vault
+        - key management
+        - certificates
+        - secrets
+        - cloud guard
+        - security zones
+        - vulnerability scanning
+        - data safe
+        - audit
+        - logging
+        - shielded instances
+        
+        📦 Containers & Cloud Native
+        - oke
+        - kubernetes
+        - container registry
+        - api gateway
+        - functions
+        - streaming
+        - events
+        - service mesh
+        
+        🗄️ Databases
+        - autonomous database
+        - adw
+        - atp
+        - base database
+        - exadata
+        - mysql
+        - nosql
+        
+        📊 Analytics & AI
+        - analytics cloud
+        - data science
+        - data catalog
+        - big data service
+        - generative ai
+        - ai services
+        
+        ────────────────────────────────
+        STEP 3 — Keywords rules (CRITICAL)
+        ────────────────────────────────
+        The "keywords" field MUST:
+        - ALWAYS include at least ONE OCI service keyword (e.g. "compute", "object storage", "oke")
+        - Include technical capability terms (e.g. resize, autoscaling, encryption)
+        - NEVER include generic verbs (permitir, possuir, oferecer)
+        - NEVER include full sentences
+        
+        ────────────────────────────────
+        STEP 4 — Output rules
+        ────────────────────────────────
+        Return ONLY valid JSON between <json> tags.
+        Do NOT explain your reasoning.
+        
+        Question:
+        {question}
+        
+        <json>
+        {{
+          "requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL",
+          "subject": "<short technical subject, e.g. 'Compute Instances'>",
+          "expected_value": "<technical capability or condition, or empty string>",
+          "decision_type": "YES_NO | YES_NO_PARTIAL",
+          "keywords": ["mandatory_oci_service", "technical_capability", "additional_term"]
+        }}
+        </json>
+        """
 
     resp = llm_for_rag.invoke(prompt)
     raw = resp.content.strip()
@@ -498,7 +622,8 @@ def semantic_chunking(text):
     2. Separate paragraphs by heading
     3. Indicate columns with [COLUMN 1], [COLUMN 2] if present
     4. Indicate tables with [TABLE] in markdown format
-    5. Indicate explicity metrics (if it exists)
+    5. ALWAYS PUT THE URL if there is a Reference
+    6. Indicate explicity metrics (if it exists)
        Examples:
          - Oracle Financial Services RTO is 1 hour
          - The Oracle Banking Supply Chain Finance Cloud Service A maximum number of 10K Hosted Transactions
@@ -515,7 +640,6 @@ def semantic_chunking(text):
 
     return response
 
-
 def read_pdfs(pdf_path):
     if "-ocr" in pdf_path:
         doc_pages = PyMuPDFLoader(str(pdf_path)).load()
@@ -568,7 +692,11 @@ def save_indexed_docs(docs):
 # Main Function
 # =========================
 def chat():
-    pdf_paths = ['RFP - Financial v2.pdf']
+    PDF_FOLDER = Path("docs")  # pasta onde estão os PDFs
+
+    pdf_paths = sorted(
+        str(p) for p in PDF_FOLDER.glob("*.pdf")
+    )
 
     already_indexed_docs = load_previously_indexed_docs()
     updated_docs = set()
@@ -588,6 +716,7 @@ def chat():
             print(f"✅ Document already indexed: {pdf_path}")
             continue
         full_text = read_pdfs(pdf_path=pdf_path)
+        path_url = filename_to_url(os.path.basename(pdf_path))
 
         text_chunks = smart_split_text(full_text, max_chunk_size=10_000)
         overflow_buffer = ""
@@ -610,7 +739,9 @@ def chat():
                     overflow_buffer = ""
 
                 for chapter_text in chapters:
-                    doc = Document(page_content=chapter_text, metadata={"source": pdf_path})
+                    reference_url = "Reference: " + path_url
+                    chapter_text = chapter_text + "\n" + reference_url
+                    doc = Document(page_content=chapter_text, metadata={"source": pdf_path, "reference": reference_url})
                     new_chunks.append(doc)
                     print(f"✅ New chapter indexed:\n{chapter_text}...\n")
 
@@ -653,9 +784,14 @@ def chat():
     
     Decision rules:
     - Answer ONLY with YES, NO or PARTIAL
-    - Do NOT assume anything not explicitly stated
     - If value differs, answer PARTIAL
     - If not found, answer NO
+
+    Interpretation rules (MANDATORY):
+    - If a capability is supported but requires reboot, downtime, or restart, it STILL counts as YES unless the requirement explicitly forbids it.
+    - "Servidor em funcionamento" means the resource exists and is active before the operation, not that it must remain online without interruption.
+    - Only answer NO if the operation is NOT supported at all or requires destroying and recreating the resource.
+    - Reboot, restart, or brief unavailability MUST NOT be interpreted as lack of support.
     
     Confidence rules:
     - HIGH: Explicit evidence directly answers the requirement
@@ -671,7 +807,7 @@ def chat():
     Service scope rules (MANDATORY):
     - Evidence is valid ONLY if it refers to the SAME service category as the requirement.
     - Do NOT use evidence from a different Oracle Cloud service to justify another.
-    
+
     OUTPUT CONSTRAINTS (MANDATORY):
     - Return ONLY a valid JSON object
     - Do NOT include explanations, comments, markdown, lists, or code fences
@@ -695,22 +831,6 @@ def chat():
     """
     prompt = PromptTemplate.from_template(RFP_DECISION_TEMPLATE)
 
-    def get_context(x):
-        query = x.get("input") if isinstance(x, dict) else x
-
-        # 1. Recupera chunks vetoriais normalmente
-        docs = retriever.invoke(query)
-
-        req = parse_rfp_requirement(query)
-        query_terms = extract_graph_keywords_from_requirement(req)
-        graph_context = query_knowledge_graph(query_terms)
-
-        graph_terms = extract_terms_from_graph_text(graph_context)
-
-        reranked_chunks = rerank_documents_with_graph_terms(docs, query, graph_terms)
-
-        return "\n\n".join(reranked_chunks)
-
     def get_context_from_requirement(req: dict):
         query_terms = extract_graph_keywords_from_requirement(req)
 
@@ -757,21 +877,6 @@ def chat():
         print(response)
         print("\n" + "=" * 80 + "\n")
 
-def get_context(x):
-    query = x.get("input") if isinstance(x, dict) else x
-
-    docs = retriever.invoke(query)
-
-    req = parse_rfp_requirement(query)
-    query_terms = extract_graph_keywords_from_requirement(req)
-    graph_context = query_knowledge_graph(query_terms)
-
-    graph_terms = extract_terms_from_graph_text(graph_context)
-
-    reranked_chunks = rerank_documents_with_graph_terms(docs, query, graph_terms)
-
-    return "\n\n".join(reranked_chunks)
-
 def get_context_from_requirement(req: dict):
     query_terms = extract_graph_keywords_from_requirement(req)
 
@@ -823,10 +928,15 @@ Graph evidence:
 
 Decision rules:
 - Answer ONLY with YES, NO or PARTIAL
-- Do NOT assume anything not explicitly stated
 - If value differs, answer PARTIAL
 - If not found, answer NO
 
+Interpretation rules (MANDATORY):
+- If a capability is supported but requires reboot, downtime, or restart, it STILL counts as YES unless the requirement explicitly forbids it.
+- "Servidor em funcionamento" means the resource exists and is active before the operation, not that it must remain online without interruption.
+- Only answer NO if the operation is NOT supported at all or requires destroying and recreating the resource.
+- Reboot, restart, or brief unavailability MUST NOT be interpreted as lack of support.
+
 Confidence rules:
 - HIGH: Explicit evidence directly answers the requirement
 - MEDIUM: Evidence partially matches or requires light interpretation
@@ -841,6 +951,10 @@ Ambiguity rules:
 Service scope rules (MANDATORY):
 - Evidence is valid ONLY if it refers to the SAME service category as the requirement.
 - Do NOT use evidence from a different Oracle Cloud service to justify another.
+- PaaS services (e.g. Autonomous Database) MUST NOT be used as evidence for IaaS/Compute requirements.
+- If the requirement is under Compute/IaaS, evidence MUST explicitly mention Compute, IaaS, VM, Bare Metal, or equivalent infrastructure services.
+- Cross-service inference (vendor-level capability applied to another service) is strictly forbidden.
+- Get all URL references or sources as evidences
 
 OUTPUT CONSTRAINTS (MANDATORY):
 - Return ONLY a valid JSON object