adjust

2026-03-03 16:09:35 +00:00 · 2026-01-14 09:44:22 -03:00
parent 1dce8e20d3
commit 51cf883150
2 changed files with 191 additions and 77 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/files/graphrag_rerank.py
+++ b/files/graphrag_rerank.py
@@ -7,7 +7,7 @@ from langchain.schema.runnable import RunnableMap
 from langchain_community.document_loaders import UnstructuredPDFLoader, PyMuPDFLoader
 from langchain_core.documents import Document
 from langchain_core.runnables import RunnableLambda
-
+from pathlib import Path
 from tqdm import tqdm
 import os
 import pickle
@@ -15,6 +15,7 @@ import re
 import atexit
 import oracledb
 import json
+import base64

 # =========================
 # Oracle Autonomous Configuration
@@ -22,9 +23,8 @@ import json
 WALLET_PATH = "Wallet_oradb23ai"
 DB_ALIAS = "oradb23ai_high"
 USERNAME = "admin"
-PASSWORD = "**********"
+PASSWORD = "Moniquinha1972"
 os.environ["TNS_ADMIN"] = WALLET_PATH
-GRAPH_NAME = "GRAPH_DB_1"

 # =========================
 # Global Configurations
@@ -32,7 +32,7 @@ GRAPH_NAME = "GRAPH_DB_1"
 INDEX_PATH = "./faiss_index"
 PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl")
 chapter_separator_regex = r"^(#{1,6} .+|\*\*.+\*\*)$"
-pdf_paths = ['<YOUR_KNOWLEDGE_BASE_FILE>.pdf']
+GRAPH_NAME = "OCI_GRAPH"

 # =========================
 # LLM Definitions
@@ -52,7 +52,6 @@ llm_for_rag = ChatOCIGenAI(
    auth_profile="DEFAULT",
 )

-
 embeddings = OCIGenAIEmbeddings(
    model_id="cohere.embed-multilingual-v3.0",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
@@ -70,6 +69,12 @@ oracle_conn = oracledb.connect(
 )
 atexit.register(lambda: oracle_conn.close())

+def filename_to_url(filename: str, suffix: str = ".pdf") -> str:
+    if filename.endswith(suffix):
+        filename = filename[: -len(suffix)]
+    decoded = base64.urlsafe_b64decode(filename.encode("ascii"))
+    return decoded.decode("utf-8")
+
 # =========================
 # Oracle Graph Client
 # =========================
@@ -81,7 +86,6 @@ def ensure_oracle_text_index(
 ):
    cursor = conn.cursor()

-    # 1. Verifica se índice existe e status
    cursor.execute("""
                   SELECT status
                   FROM user_indexes
@@ -92,7 +96,6 @@ def ensure_oracle_text_index(
    index_exists = row is not None
    index_status = row[0] if row else None

-    # 2. Se índice não existe → cria e NÃO sincroniza agora
    if not index_exists:
        print(f"🛠️ Creating Oracle Text index {index_name}")

@@ -107,7 +110,6 @@ def ensure_oracle_text_index(
        print(f"✅ Index {index_name} created (sync deferred)")
        return

-    # 3. Se índice existe mas está inválido → drop + recreate
    if index_status != "VALID":
        print(f"⚠️ Index {index_name} is {index_status}. Recreating...")

@@ -129,7 +131,6 @@ def ensure_oracle_text_index(
        print(f"♻️ Index {index_name} recreated (sync deferred)")
        return

-    # 4. Índice existe e está VALID → sincroniza com proteção
    print(f"🔄 Syncing Oracle Text index: {index_name}")
    try:
        cursor.execute(f"""
@@ -191,19 +192,21 @@ def create_tables_if_not_exist(conn):


 create_tables_if_not_exist(oracle_conn)
-ensure_oracle_text_index(
-    oracle_conn,
-    "ENTITIES_" + GRAPH_NAME,
-    "NAME",
-    "IDX_ENT_" + GRAPH_NAME + "_NAME"
-)

-ensure_oracle_text_index(
-    oracle_conn,
-    "RELATIONS_" + GRAPH_NAME,
-    "RELATION_TYPE",
-    "IDX_REL_" + GRAPH_NAME + "_RELTYPE"
-)
+# IF GRAPH INDEX PROBLEM, Reindex
+# ensure_oracle_text_index(
+#     oracle_conn,
+#     "ENTITIES_" + GRAPH_NAME,
+#     "NAME",
+#     "IDX_ENT_" + GRAPH_NAME + "_NAME"
+# )
+#
+# ensure_oracle_text_index(
+#     oracle_conn,
+#     "RELATIONS_" + GRAPH_NAME,
+#     "RELATION_TYPE",
+#     "IDX_REL_" + GRAPH_NAME + "_RELTYPE"
+# )

 def create_knowledge_graph(chunks):
    cursor = oracle_conn.cursor()
@@ -317,24 +320,145 @@ def create_knowledge_graph(chunks):

 def parse_rfp_requirement(question: str) -> dict:
    prompt = f"""
-You are an RFP requirement extractor.
-
-Return the result STRICTLY between the tags <json> and </json>.
-Do NOT write anything outside these tags.
-
-Question:
-{question}
-
-<json>
-{{
-  "requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL",
-  "subject": "<short subject>",
-  "expected_value": "<value or condition if any>",
-  "decision_type": "YES_NO | YES_NO_PARTIAL",
-  "keywords": ["keyword1", "keyword2"]
-}}
-</json>
-"""
+        You are an RFP requirement NORMALIZER for Oracle Cloud Infrastructure (OCI).
+        
+        Your job is NOT to summarize the question.
+        Your job is to STRUCTURE the requirement so it can be searched in:
+        - Technical documentation
+        - Knowledge Graph
+        - Vector databases
+        
+        ────────────────────────────────
+        STEP 1 — Understand the requirement
+        ────────────────────────────────
+        From the question, identify:
+        1. The PRIMARY OCI SERVICE CATEGORY involved
+        2. The MAIN TECHNICAL SUBJECT (short and precise)
+        3. The EXPECTED TECHNICAL CAPABILITY or CONDITION (if any)
+        
+        IMPORTANT:
+        - Ignore marketing language
+        - Ignore phrases like "possui", "permite", "oferece"
+        - Focus ONLY on concrete technical meaning
+        
+        ────────────────────────────────
+        STEP 2 — Mandatory service classification
+        ────────────────────────────────
+        You MUST choose ONE primary technology from the list below
+        and INCLUDE IT EXPLICITLY in the keywords list.
+        
+        Choose the MOST SPECIFIC applicable item.
+        
+        ☁️ OCI SERVICE CATEGORIES (MANDATORY)
+        
+        🖥️ Compute (IaaS)
+        - compute
+        - compute instances
+        - virtual machine
+        - bare metal
+        - gpu
+        - hpc
+        - confidential computing
+        - autoscaling
+        - instance pools
+        - live migration
+        - ocvs (vmware)
+        - arm compute
+        
+        💾 Storage
+        - object storage
+        - archive storage
+        - block volume
+        - boot volume
+        - file storage
+        - volume groups
+        - snapshots
+        - replication
+        
+        🌐 Networking
+        - vcn
+        - load balancer
+        - network load balancer
+        - dns
+        - fastconnect
+        - drg
+        - firewall
+        - waf
+        - bastion
+        - vtap
+        - private endpoint
+        
+        🔐 Security & Identity
+        - iam
+        - compartments
+        - policies
+        - oci vault
+        - key management
+        - certificates
+        - secrets
+        - cloud guard
+        - security zones
+        - vulnerability scanning
+        - data safe
+        - audit
+        - logging
+        - shielded instances
+        
+        📦 Containers & Cloud Native
+        - oke
+        - kubernetes
+        - container registry
+        - api gateway
+        - functions
+        - streaming
+        - events
+        - service mesh
+        
+        🗄️ Databases
+        - autonomous database
+        - adw
+        - atp
+        - base database
+        - exadata
+        - mysql
+        - nosql
+        
+        📊 Analytics & AI
+        - analytics cloud
+        - data science
+        - data catalog
+        - big data service
+        - generative ai
+        - ai services
+        
+        ────────────────────────────────
+        STEP 3 — Keywords rules (CRITICAL)
+        ────────────────────────────────
+        The "keywords" field MUST:
+        - ALWAYS include at least ONE OCI service keyword (e.g. "compute", "object storage", "oke")
+        - Include technical capability terms (e.g. resize, autoscaling, encryption)
+        - NEVER include generic verbs (permitir, possuir, oferecer)
+        - NEVER include full sentences
+        
+        ────────────────────────────────
+        STEP 4 — Output rules
+        ────────────────────────────────
+        Return ONLY valid JSON between <json> tags.
+        Do NOT explain your reasoning.
+        
+        Question:
+        {question}
+        
+        <json>
+        {{
+          "requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL",
+          "subject": "<short technical subject, e.g. 'Compute Instances'>",
+          "expected_value": "<technical capability or condition, or empty string>",
+          "decision_type": "YES_NO | YES_NO_PARTIAL",
+          "keywords": ["mandatory_oci_service", "technical_capability", "additional_term"]
+        }}
+        </json>
+        """

    resp = llm_for_rag.invoke(prompt)
    raw = resp.content.strip()
@@ -498,7 +622,8 @@ def semantic_chunking(text):
    2. Separate paragraphs by heading
    3. Indicate columns with [COLUMN 1], [COLUMN 2] if present
    4. Indicate tables with [TABLE] in markdown format
-    5. Indicate explicity metrics (if it exists)
+    5. ALWAYS PUT THE URL if there is a Reference
+    6. Indicate explicity metrics (if it exists)
       Examples:
         - Oracle Financial Services RTO is 1 hour
         - The Oracle Banking Supply Chain Finance Cloud Service A maximum number of 10K Hosted Transactions
@@ -515,7 +640,6 @@ def semantic_chunking(text):

    return response

-
 def read_pdfs(pdf_path):
    if "-ocr" in pdf_path:
        doc_pages = PyMuPDFLoader(str(pdf_path)).load()
@@ -568,7 +692,11 @@ def save_indexed_docs(docs):
 # Main Function
 # =========================
 def chat():
-    pdf_paths = ['RFP - Financial v2.pdf']
+    PDF_FOLDER = Path("docs")  # pasta onde estão os PDFs
+
+    pdf_paths = sorted(
+        str(p) for p in PDF_FOLDER.glob("*.pdf")
+    )

    already_indexed_docs = load_previously_indexed_docs()
    updated_docs = set()
@@ -588,6 +716,7 @@ def chat():
            print(f"✅ Document already indexed: {pdf_path}")
            continue
        full_text = read_pdfs(pdf_path=pdf_path)
+        path_url = filename_to_url(os.path.basename(pdf_path))

        text_chunks = smart_split_text(full_text, max_chunk_size=10_000)
        overflow_buffer = ""
@@ -610,7 +739,9 @@ def chat():
                    overflow_buffer = ""

                for chapter_text in chapters:
-                    doc = Document(page_content=chapter_text, metadata={"source": pdf_path})
+                    reference_url = "Reference: " + path_url
+                    chapter_text = chapter_text + "\n" + reference_url
+                    doc = Document(page_content=chapter_text, metadata={"source": pdf_path, "reference": reference_url})
                    new_chunks.append(doc)
                    print(f"✅ New chapter indexed:\n{chapter_text}...\n")

@@ -653,9 +784,14 @@ def chat():
    
    Decision rules:
    - Answer ONLY with YES, NO or PARTIAL
-    - Do NOT assume anything not explicitly stated
    - If value differs, answer PARTIAL
    - If not found, answer NO
+
+    Interpretation rules (MANDATORY):
+    - If a capability is supported but requires reboot, downtime, or restart, it STILL counts as YES unless the requirement explicitly forbids it.
+    - "Servidor em funcionamento" means the resource exists and is active before the operation, not that it must remain online without interruption.
+    - Only answer NO if the operation is NOT supported at all or requires destroying and recreating the resource.
+    - Reboot, restart, or brief unavailability MUST NOT be interpreted as lack of support.
    
    Confidence rules:
    - HIGH: Explicit evidence directly answers the requirement
@@ -671,7 +807,7 @@ def chat():
    Service scope rules (MANDATORY):
    - Evidence is valid ONLY if it refers to the SAME service category as the requirement.
    - Do NOT use evidence from a different Oracle Cloud service to justify another.
-    
+
    OUTPUT CONSTRAINTS (MANDATORY):
    - Return ONLY a valid JSON object
    - Do NOT include explanations, comments, markdown, lists, or code fences
@@ -695,22 +831,6 @@ def chat():
    """
    prompt = PromptTemplate.from_template(RFP_DECISION_TEMPLATE)

-    def get_context(x):
-        query = x.get("input") if isinstance(x, dict) else x
-
-        # 1. Recupera chunks vetoriais normalmente
-        docs = retriever.invoke(query)
-
-        req = parse_rfp_requirement(query)
-        query_terms = extract_graph_keywords_from_requirement(req)
-        graph_context = query_knowledge_graph(query_terms)
-
-        graph_terms = extract_terms_from_graph_text(graph_context)
-
-        reranked_chunks = rerank_documents_with_graph_terms(docs, query, graph_terms)
-
-        return "\n\n".join(reranked_chunks)
-
    def get_context_from_requirement(req: dict):
        query_terms = extract_graph_keywords_from_requirement(req)

@@ -757,21 +877,6 @@ def chat():
        print(response)
        print("\n" + "=" * 80 + "\n")

-def get_context(x):
-    query = x.get("input") if isinstance(x, dict) else x
-
-    docs = retriever.invoke(query)
-
-    req = parse_rfp_requirement(query)
-    query_terms = extract_graph_keywords_from_requirement(req)
-    graph_context = query_knowledge_graph(query_terms)
-
-    graph_terms = extract_terms_from_graph_text(graph_context)
-
-    reranked_chunks = rerank_documents_with_graph_terms(docs, query, graph_terms)
-
-    return "\n\n".join(reranked_chunks)
-
 def get_context_from_requirement(req: dict):
    query_terms = extract_graph_keywords_from_requirement(req)

@@ -823,10 +928,15 @@ Graph evidence:

 Decision rules:
 - Answer ONLY with YES, NO or PARTIAL
- Do NOT assume anything not explicitly stated
 - If value differs, answer PARTIAL
 - If not found, answer NO

+Interpretation rules (MANDATORY):
+- If a capability is supported but requires reboot, downtime, or restart, it STILL counts as YES unless the requirement explicitly forbids it.
+- "Servidor em funcionamento" means the resource exists and is active before the operation, not that it must remain online without interruption.
+- Only answer NO if the operation is NOT supported at all or requires destroying and recreating the resource.
+- Reboot, restart, or brief unavailability MUST NOT be interpreted as lack of support.
+
 Confidence rules:
 - HIGH: Explicit evidence directly answers the requirement
 - MEDIUM: Evidence partially matches or requires light interpretation
@@ -841,6 +951,10 @@ Ambiguity rules:
 Service scope rules (MANDATORY):
 - Evidence is valid ONLY if it refers to the SAME service category as the requirement.
 - Do NOT use evidence from a different Oracle Cloud service to justify another.
+- PaaS services (e.g. Autonomous Database) MUST NOT be used as evidence for IaaS/Compute requirements.
+- If the requirement is under Compute/IaaS, evidence MUST explicitly mention Compute, IaaS, VM, Bare Metal, or equivalent infrastructure services.
+- Cross-service inference (vendor-level capability applied to another service) is strictly forbidden.
+- Get all URL references or sources as evidences

 OUTPUT CONSTRAINTS (MANDATORY):
 - Return ONLY a valid JSON object