From 51cf883150699cab773738d1d9cb1eee5c86927a Mon Sep 17 00:00:00 2001 From: Cristiano Hoshikawa Date: Wed, 14 Jan 2026 09:44:22 -0300 Subject: [PATCH] adjust --- .DS_Store | Bin 6148 -> 10244 bytes files/graphrag_rerank.py | 268 ++++++++++++++++++++++++++++----------- 2 files changed, 191 insertions(+), 77 deletions(-) diff --git a/.DS_Store b/.DS_Store index 154b1dea0b56a0bebf521d7761b0544679974fda..d9a783d85bc43d45877424a41066d6984c7beb97 100644 GIT binary patch literal 10244 zcmeHMYitx%6uxKL!i;6?6pFBY$doEpE%pJlJPdAMfl?4;`vxhqJ3~8RI#YLMcPq6? z4L&d!VvG;eAEN$6e8t39B7zYkAtGsrN+KFGni%|{sEI#}=iWP8b_-R18O6ED+%xw) z?%DIrId|sXC4@jzLa!x6C4`80QL42*mHd`=NkKzm{h)J9^?(C{Ui0V6;l zKp;RMKp;RMK;Ykk0KT(n6Z1I@`T&6dfdGL?1nByp;zeoLr{kQmr2{**1wfjMY}W8z zp*KiXpN4%p&M8W;0XId;O~Kz{05?az>g$DlI?gFKC*W^B!1oOP4h5KZoL^gx&Ei};lYfv zf%21RO7hUJNpxZmEz2{hK9McW^nGOcV7c!*!+yxQ=)LCXYo)*LVZSazBg;QIK>gFW z9TJ=BJ!eg(EYo*;dC|!m`OK5HnVv5v&-Tk!r#!L)3c2OWg4k|rKs7lJ_<6%Zi>`RHuHdUdeQ-(@dFm6tMX^bSIfP9y<4`* z+j`8HV@J$gM!!{G|y)_@y1~~joW3!U8EJ^ z8jgZ=s9=8C!bLaNEL(Y7%i2vH6H4LiIYo12xexKSJ#&X?#5+@_?hW>8Zp^TC(;nzf znywkOj1F5HG74d>Bu&gIoUMe4L#3ipA@(Gq=3NQXyPI+41)*|Dd0pJ9n{jhFN#|zzd3GT&)ITi=q5?g#10oR;ZrP@t0l#iI(A}GaML<@si^2ur=>*=i?bT! z6{6BEbtid*oO!FfS}cSrsRs}DhA|{d4w38-Q2R;m5 zB%&x)ge&<|so;c=io?VFvw&eODfDAw0jVS_NfYTJ8ga=zWFL8o93=0NkH~TIIXOpu zAQ#B*Faz@8YA65&LQo8)PzH;j8fu^pmOvQVU>&qWH*5zDVxWTwad064Nf?Fu;C^@z z_P{=P686K>@C>{F2jN9{8D4?6;3&KgAHXp<4QJs?_zJ#<3-FteE6ft+2_d0aC=sp` z>V*bjxzH$d2%SQg&@F5iOd&3$gb^N=kgmu6beKMW7t+jnbd&xPBfKgLJx}wX)iY^r`3E!4j32ZT(^#fR*s=$$9cK0H*pJn2!lw1I!=9~EJiq?|JuYk$7Y394Q z`rh{Wd;89cGkDG7`5f#~wg#y11xD&;kSk1OfyC1Of!EBm{DJFe*L& zZ=L@C|0^lvAeI1u0D&n8KyFKESFZooaoiDAJ_WCQL55*)a(-?BP!9ux+yfBF3N$;D zA%!88Au%T%NtQu<5(5K2vNX{0$yWvBCtniSF*#O.pdf'] +GRAPH_NAME = "OCI_GRAPH" # ========================= # LLM Definitions @@ -52,7 +52,6 @@ llm_for_rag = ChatOCIGenAI( auth_profile="DEFAULT", ) - embeddings = OCIGenAIEmbeddings( model_id="cohere.embed-multilingual-v3.0", service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com", @@ -70,6 +69,12 @@ oracle_conn = oracledb.connect( ) atexit.register(lambda: oracle_conn.close()) +def filename_to_url(filename: str, suffix: str = ".pdf") -> str: + if filename.endswith(suffix): + filename = filename[: -len(suffix)] + decoded = base64.urlsafe_b64decode(filename.encode("ascii")) + return decoded.decode("utf-8") + # ========================= # Oracle Graph Client # ========================= @@ -81,7 +86,6 @@ def ensure_oracle_text_index( ): cursor = conn.cursor() - # 1. Verifica se índice existe e status cursor.execute(""" SELECT status FROM user_indexes @@ -92,7 +96,6 @@ def ensure_oracle_text_index( index_exists = row is not None index_status = row[0] if row else None - # 2. Se índice não existe → cria e NÃO sincroniza agora if not index_exists: print(f"🛠️ Creating Oracle Text index {index_name}") @@ -107,7 +110,6 @@ def ensure_oracle_text_index( print(f"✅ Index {index_name} created (sync deferred)") return - # 3. Se índice existe mas está inválido → drop + recreate if index_status != "VALID": print(f"⚠️ Index {index_name} is {index_status}. Recreating...") @@ -129,7 +131,6 @@ def ensure_oracle_text_index( print(f"♻️ Index {index_name} recreated (sync deferred)") return - # 4. Índice existe e está VALID → sincroniza com proteção print(f"🔄 Syncing Oracle Text index: {index_name}") try: cursor.execute(f""" @@ -191,19 +192,21 @@ def create_tables_if_not_exist(conn): create_tables_if_not_exist(oracle_conn) -ensure_oracle_text_index( - oracle_conn, - "ENTITIES_" + GRAPH_NAME, - "NAME", - "IDX_ENT_" + GRAPH_NAME + "_NAME" -) -ensure_oracle_text_index( - oracle_conn, - "RELATIONS_" + GRAPH_NAME, - "RELATION_TYPE", - "IDX_REL_" + GRAPH_NAME + "_RELTYPE" -) +# IF GRAPH INDEX PROBLEM, Reindex +# ensure_oracle_text_index( +# oracle_conn, +# "ENTITIES_" + GRAPH_NAME, +# "NAME", +# "IDX_ENT_" + GRAPH_NAME + "_NAME" +# ) +# +# ensure_oracle_text_index( +# oracle_conn, +# "RELATIONS_" + GRAPH_NAME, +# "RELATION_TYPE", +# "IDX_REL_" + GRAPH_NAME + "_RELTYPE" +# ) def create_knowledge_graph(chunks): cursor = oracle_conn.cursor() @@ -317,24 +320,145 @@ def create_knowledge_graph(chunks): def parse_rfp_requirement(question: str) -> dict: prompt = f""" -You are an RFP requirement extractor. - -Return the result STRICTLY between the tags and . -Do NOT write anything outside these tags. - -Question: -{question} - - -{{ - "requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL", - "subject": "", - "expected_value": "", - "decision_type": "YES_NO | YES_NO_PARTIAL", - "keywords": ["keyword1", "keyword2"] -}} - -""" + You are an RFP requirement NORMALIZER for Oracle Cloud Infrastructure (OCI). + + Your job is NOT to summarize the question. + Your job is to STRUCTURE the requirement so it can be searched in: + - Technical documentation + - Knowledge Graph + - Vector databases + + ──────────────────────────────── + STEP 1 — Understand the requirement + ──────────────────────────────── + From the question, identify: + 1. The PRIMARY OCI SERVICE CATEGORY involved + 2. The MAIN TECHNICAL SUBJECT (short and precise) + 3. The EXPECTED TECHNICAL CAPABILITY or CONDITION (if any) + + IMPORTANT: + - Ignore marketing language + - Ignore phrases like "possui", "permite", "oferece" + - Focus ONLY on concrete technical meaning + + ──────────────────────────────── + STEP 2 — Mandatory service classification + ──────────────────────────────── + You MUST choose ONE primary technology from the list below + and INCLUDE IT EXPLICITLY in the keywords list. + + Choose the MOST SPECIFIC applicable item. + + ☁️ OCI SERVICE CATEGORIES (MANDATORY) + + 🖥️ Compute (IaaS) + - compute + - compute instances + - virtual machine + - bare metal + - gpu + - hpc + - confidential computing + - autoscaling + - instance pools + - live migration + - ocvs (vmware) + - arm compute + + 💾 Storage + - object storage + - archive storage + - block volume + - boot volume + - file storage + - volume groups + - snapshots + - replication + + 🌐 Networking + - vcn + - load balancer + - network load balancer + - dns + - fastconnect + - drg + - firewall + - waf + - bastion + - vtap + - private endpoint + + 🔐 Security & Identity + - iam + - compartments + - policies + - oci vault + - key management + - certificates + - secrets + - cloud guard + - security zones + - vulnerability scanning + - data safe + - audit + - logging + - shielded instances + + 📦 Containers & Cloud Native + - oke + - kubernetes + - container registry + - api gateway + - functions + - streaming + - events + - service mesh + + 🗄️ Databases + - autonomous database + - adw + - atp + - base database + - exadata + - mysql + - nosql + + 📊 Analytics & AI + - analytics cloud + - data science + - data catalog + - big data service + - generative ai + - ai services + + ──────────────────────────────── + STEP 3 — Keywords rules (CRITICAL) + ──────────────────────────────── + The "keywords" field MUST: + - ALWAYS include at least ONE OCI service keyword (e.g. "compute", "object storage", "oke") + - Include technical capability terms (e.g. resize, autoscaling, encryption) + - NEVER include generic verbs (permitir, possuir, oferecer) + - NEVER include full sentences + + ──────────────────────────────── + STEP 4 — Output rules + ──────────────────────────────── + Return ONLY valid JSON between tags. + Do NOT explain your reasoning. + + Question: + {question} + + + {{ + "requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL", + "subject": "", + "expected_value": "", + "decision_type": "YES_NO | YES_NO_PARTIAL", + "keywords": ["mandatory_oci_service", "technical_capability", "additional_term"] + }} + + """ resp = llm_for_rag.invoke(prompt) raw = resp.content.strip() @@ -498,7 +622,8 @@ def semantic_chunking(text): 2. Separate paragraphs by heading 3. Indicate columns with [COLUMN 1], [COLUMN 2] if present 4. Indicate tables with [TABLE] in markdown format - 5. Indicate explicity metrics (if it exists) + 5. ALWAYS PUT THE URL if there is a Reference + 6. Indicate explicity metrics (if it exists) Examples: - Oracle Financial Services RTO is 1 hour - The Oracle Banking Supply Chain Finance Cloud Service A maximum number of 10K Hosted Transactions @@ -515,7 +640,6 @@ def semantic_chunking(text): return response - def read_pdfs(pdf_path): if "-ocr" in pdf_path: doc_pages = PyMuPDFLoader(str(pdf_path)).load() @@ -568,7 +692,11 @@ def save_indexed_docs(docs): # Main Function # ========================= def chat(): - pdf_paths = ['RFP - Financial v2.pdf'] + PDF_FOLDER = Path("docs") # pasta onde estão os PDFs + + pdf_paths = sorted( + str(p) for p in PDF_FOLDER.glob("*.pdf") + ) already_indexed_docs = load_previously_indexed_docs() updated_docs = set() @@ -588,6 +716,7 @@ def chat(): print(f"✅ Document already indexed: {pdf_path}") continue full_text = read_pdfs(pdf_path=pdf_path) + path_url = filename_to_url(os.path.basename(pdf_path)) text_chunks = smart_split_text(full_text, max_chunk_size=10_000) overflow_buffer = "" @@ -610,7 +739,9 @@ def chat(): overflow_buffer = "" for chapter_text in chapters: - doc = Document(page_content=chapter_text, metadata={"source": pdf_path}) + reference_url = "Reference: " + path_url + chapter_text = chapter_text + "\n" + reference_url + doc = Document(page_content=chapter_text, metadata={"source": pdf_path, "reference": reference_url}) new_chunks.append(doc) print(f"✅ New chapter indexed:\n{chapter_text}...\n") @@ -653,9 +784,14 @@ def chat(): Decision rules: - Answer ONLY with YES, NO or PARTIAL - - Do NOT assume anything not explicitly stated - If value differs, answer PARTIAL - If not found, answer NO + + Interpretation rules (MANDATORY): + - If a capability is supported but requires reboot, downtime, or restart, it STILL counts as YES unless the requirement explicitly forbids it. + - "Servidor em funcionamento" means the resource exists and is active before the operation, not that it must remain online without interruption. + - Only answer NO if the operation is NOT supported at all or requires destroying and recreating the resource. + - Reboot, restart, or brief unavailability MUST NOT be interpreted as lack of support. Confidence rules: - HIGH: Explicit evidence directly answers the requirement @@ -671,7 +807,7 @@ def chat(): Service scope rules (MANDATORY): - Evidence is valid ONLY if it refers to the SAME service category as the requirement. - Do NOT use evidence from a different Oracle Cloud service to justify another. - + OUTPUT CONSTRAINTS (MANDATORY): - Return ONLY a valid JSON object - Do NOT include explanations, comments, markdown, lists, or code fences @@ -695,22 +831,6 @@ def chat(): """ prompt = PromptTemplate.from_template(RFP_DECISION_TEMPLATE) - def get_context(x): - query = x.get("input") if isinstance(x, dict) else x - - # 1. Recupera chunks vetoriais normalmente - docs = retriever.invoke(query) - - req = parse_rfp_requirement(query) - query_terms = extract_graph_keywords_from_requirement(req) - graph_context = query_knowledge_graph(query_terms) - - graph_terms = extract_terms_from_graph_text(graph_context) - - reranked_chunks = rerank_documents_with_graph_terms(docs, query, graph_terms) - - return "\n\n".join(reranked_chunks) - def get_context_from_requirement(req: dict): query_terms = extract_graph_keywords_from_requirement(req) @@ -757,21 +877,6 @@ def chat(): print(response) print("\n" + "=" * 80 + "\n") -def get_context(x): - query = x.get("input") if isinstance(x, dict) else x - - docs = retriever.invoke(query) - - req = parse_rfp_requirement(query) - query_terms = extract_graph_keywords_from_requirement(req) - graph_context = query_knowledge_graph(query_terms) - - graph_terms = extract_terms_from_graph_text(graph_context) - - reranked_chunks = rerank_documents_with_graph_terms(docs, query, graph_terms) - - return "\n\n".join(reranked_chunks) - def get_context_from_requirement(req: dict): query_terms = extract_graph_keywords_from_requirement(req) @@ -823,10 +928,15 @@ Graph evidence: Decision rules: - Answer ONLY with YES, NO or PARTIAL -- Do NOT assume anything not explicitly stated - If value differs, answer PARTIAL - If not found, answer NO +Interpretation rules (MANDATORY): +- If a capability is supported but requires reboot, downtime, or restart, it STILL counts as YES unless the requirement explicitly forbids it. +- "Servidor em funcionamento" means the resource exists and is active before the operation, not that it must remain online without interruption. +- Only answer NO if the operation is NOT supported at all or requires destroying and recreating the resource. +- Reboot, restart, or brief unavailability MUST NOT be interpreted as lack of support. + Confidence rules: - HIGH: Explicit evidence directly answers the requirement - MEDIUM: Evidence partially matches or requires light interpretation @@ -841,6 +951,10 @@ Ambiguity rules: Service scope rules (MANDATORY): - Evidence is valid ONLY if it refers to the SAME service category as the requirement. - Do NOT use evidence from a different Oracle Cloud service to justify another. +- PaaS services (e.g. Autonomous Database) MUST NOT be used as evidence for IaaS/Compute requirements. +- If the requirement is under Compute/IaaS, evidence MUST explicitly mention Compute, IaaS, VM, Bare Metal, or equivalent infrastructure services. +- Cross-service inference (vendor-level capability applied to another service) is strictly forbidden. +- Get all URL references or sources as evidences OUTPUT CONSTRAINTS (MANDATORY): - Return ONLY a valid JSON object