commit 557ea10653bf9dca8dd99e47bacc0ee9d5d95349 Author: Cristiano Hoshikawa Date: Thu Jan 8 18:26:09 2026 -0300 first commit diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..7d9a8e5 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,12 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Environment-dependent path to Maven home directory +/mavenHomeManager.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Zeppelin ignored files +/ZeppelinRemoteNotebooks/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..c5a8feb --- /dev/null +++ b/README.md @@ -0,0 +1,325 @@ + +# 🧠 Oracle GraphRAG for RFP Validation + +**GraphRAG-based AI system for factual RFP requirement validation using Oracle 23ai, OCI Generative AI, and Vector Search** + +--- + +## 📌 Overview + +This project implements an **AI-driven RFP validation engine** designed to answer *formal RFP requirements* using **explicit, verifiable evidence** extracted from technical documentation. + +Instead of responding to open-ended conceptual questions, the system evaluates **whether a requirement is met**, returning **YES / NO / PARTIAL**, along with **exact textual evidence** and full traceability. + +The solution combines: + +- Retrieval-Augmented Generation (RAG) over PDFs +- GraphRAG for structured factual relationships +- Oracle 23ai Property Graph + Oracle Text +- OCI Generative AI (LLMs & Embeddings) +- FAISS vector search +- Flask REST API + +This project is based on the article: [Analyze PDF Documents in Natural Language with OCI Generative AI](https://docs.oracle.com/en/learn/oci-genai-pdf) + +See that article for details on how to set up and configure your development environment, Oracle Autonomous Database AI, and the other components. 
+ + +--- + +## 🎯 Why RFP-Centric (and not Concept Q&A) + +While typical knowledge base projects focus on extracting information about concepts, step-by-step instructions, and numerous answers to questions asked about a particular subject, an RFP requires a very special approach. + +>**Note:** Traditional RAG systems are optimized for *conceptual explanations*. RFPs require **objective validation**, not interpretation. + +This project shifts the AI role from: + +❌ *“Explain how the product works”* +to +✅ *“Prove whether this requirement is met, partially met, or not met”* + +--- + +## 🧩 Core Capabilities + +### ✅ RFP Requirement Parsing + +Each question is parsed into a structured requirement: + +```json +{ + "requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL", + "subject": "authentication", + "expected_value": "MFA", + "decision_type": "YES_NO | YES_NO_PARTIAL", + "keywords": ["authentication", "mfa", "identity"] +} +``` + +--- + +### 🧠 Knowledge Graph (GraphRAG) + +Facts are extracted **only when explicitly stated** in documentation and stored as graph triples: + +``` +REQUIREMENT -[HAS_METRIC]-> messages per hour +REQUIREMENT -[HAS_VALUE]-> < 1 hour +REQUIREMENT -[SUPPORTED_BY]-> Document section +``` + +This ensures: +- No hallucination +- No inferred assumptions +- Full auditability + +--- + +### 🔎 Hybrid Retrieval Strategy + +1. **Vector Search (FAISS)** +2. **Oracle Graph + Oracle Text** +3. 
**Graph-aware Re-ranking** + +--- + +### 📊 Deterministic RFP Decision Output + +```json +{ + "answer": "YES | NO | PARTIAL", + "justification": "Short factual explanation", + "evidence": [ + { + "quote": "Exact text from the document", + "source": "Document or section" + } + ] +} +``` + +--- + +## 🏗️ Architecture + +``` +PDFs + └─► Semantic Chunking + └─► FAISS Vector Index + └─► RAG Retrieval + └─► GraphRAG (Oracle 23ai) + └─► Evidence-based LLM Decision + └─► REST API Response +``` + +--- + +## 🚀 REST API + +### Health Check +GET /health + +### RFP Validation +POST /chat + +```json +{ + "question": "Does the platform support MFA and integration with corporate identity providers?" +} +``` + +--- + +## 🧪 Example Use Cases + +- Enterprise RFP / RFQ validation +- Pre-sales technical due diligence +- Compliance checks +- SaaS capability assessment +- Audit-ready AI answers + +--- + +## 🛠️ Technology Stack + +- Oracle Autonomous Database 23ai +- OCI Generative AI +- LangChain / LangGraph +- FAISS +- Flask +- Python + +--- + +## 🔐 Design Principles + +- Evidence-first +- Deterministic outputs +- No hallucination tolerance +- Explainability + +--- + +# GraphRAG for RFP Validation – Code Walkthrough + +> **Status:** Demo / Reference Implementation +> **Derived from:** Official Oracle Generative AI & GraphRAG learning material +> https://docs.oracle.com/en/learn/oci-genai-pdf + +--- + +## 🎯 Purpose of This Code + +This code implements a **GraphRAG-based pipeline focused on RFP (Request for Proposal) validation**, not generic Q&A. 
+ +>**Download** the code [graphrag_rerank.py](./files/graphrag_rerank.py) + +The main goal is to: +- Extract **explicit, verifiable facts** from large PDF contracts and datasheets +- Store those facts as **structured graph relationships** +- Answer RFP questions using **YES / NO / PARTIAL** decisions +- Always provide **document-backed evidence**, never hallucinations + +This represents a **strategic shift** from concept-based LLM answers to **compliance-grade validation**. + +--- + +## 🧠 High-Level Architecture + +1. **PDF Ingestion** + - PDFs are read using OCR-aware loaders + - Large documents are split into semantic chunks + +2. **Semantic Chunking (LLM-driven)** + - Headings, tables, metrics, and sections are normalized + - Output is optimized for both vector search and fact extraction + +3. **Vector Index (FAISS)** + - Chunks are embedded using OCI Cohere multilingual embeddings + - Enables semantic recall + +4. **Knowledge Graph (Oracle 23ai)** + - Explicit facts are extracted as triples: + - `REQUIREMENT -[HAS_METRIC]-> RTO` + - `REQUIREMENT -[HAS_VALUE]-> 1 hour` + - Stored in Oracle Property Graph tables + +5. **RFP Requirement Parsing** + - Each user question is converted into a structured requirement: + ```json + { + "requirement_type": "NON_FUNCTIONAL", + "subject": "authentication", + "expected_value": "", + "keywords": ["mfa", "ldap", "sso"] + } + ``` + +6. **Graph + Vector Fusion** + - Graph terms reinforce document reranking + - Ensures high-precision evidence retrieval + +7. 
**Deterministic RFP Decision** + - LLM outputs are constrained to: + - `YES` + - `NO` + - `PARTIAL` + - Always backed by quotes from source documents + +--- + +## 🗂️ Key Code Sections Explained + +### Oracle Autonomous & Graph Setup +- Creates entity and relation tables if not present +- Builds an Oracle **PROPERTY GRAPH** +- Uses Oracle Text indexes for semantic filtering + +### `create_knowledge_graph()` +- Uses LLM to extract **ONLY explicit facts** +- No inference, no assumptions +- Inserts entities and relations safely using MERGE + +### `parse_rfp_requirement()` +- Converts free-text questions into structured RFP requirements +- Enforces strict JSON output using `<json>` … `</json>` tags +- Includes safe fallback logic + +### `query_knowledge_graph()` +- Uses Oracle Text (`CONTAINS`) with sanitized queries +- Filters graph facts by RFP keywords +- Returns only relevant evidence + +### Graph-aware Re-ranking +- Combines: + - Vector similarity + - Graph-derived terms +- Improves precision on contractual questions + +### Final RFP Decision Chain +- Implemented with LangChain `RunnableMap` +- Clean separation of: + - Requirement parsing + - Context retrieval + - Decision generation + +--- + +## ✅ Why This Is NOT a Generic RAG + +| Traditional RAG | This GraphRAG | +|----------------|---------------| +| Answers concepts | Validates requirements | +| May hallucinate | Evidence-only | +| Free-form text | Deterministic YES / NO / PARTIAL | +| No structure | Knowledge graph | +| Chatbot | RFP analyst | + +--- + +## ⚠️ Important Design Principles + +- **Evidence-first**: If not explicitly stated → NO +- **No inference**: LLM is forbidden to assume +- **Auditability**: Every answer is traceable +- **Enterprise-grade**: Designed for legal, procurement, compliance + +--- + +## 📌 Intended Use Cases + +- RFP response automation +- Vendor compliance validation +- Contractual due diligence +- Pre-sales technical qualification +- Regulatory checks + +--- + +## 🧪 Demo Disclaimer + +This code 
# ---------------------------------------------------------------------------
# README.md -- tail of the preceding diff hunk, preserved here as comments
# ---------------------------------------------------------------------------
# is:
# - A **demo / reference implementation**
# - Not production-hardened
# - Intended for education, experimentation, and architecture discussions
#
# ---
#
# ## 👀 Acknowledgments
#
# - **Author** - Cristiano Hoshikawa (Oracle LAD A-Team Solution Engineer)
#
# ---
#
# ## 📎 References
#
# [Analyze PDF Documents in Natural Language with OCI Generative AI](https://docs.oracle.com/en/learn/oci-genai-pdf)
#
# ---
#
# ## ⚠️ Disclaimer
#
# This is a demo / reference architecture.
# Final answers depend strictly on indexed documentation.
#
# diff --git a/files/graphrag_rerank.py b/files/graphrag_rerank.py
# new file mode 100644
# index 0000000..73791e4
# --- /dev/null
# +++ b/files/graphrag_rerank.py
# @@ -0,0 +1,756 @@
# ---------------------------------------------------------------------------
"""GraphRAG pipeline for RFP requirement validation.

The module ingests PDFs, asks an LLM to normalise them into chapters, indexes
the chapters in a FAISS vector store, extracts explicit facts into Oracle
tables exposed as a property graph, and answers RFP questions with a
YES / NO / PARTIAL decision backed by retrieved evidence.

Importing this module has side effects: it opens an Oracle connection, creates
the entity/relation tables when missing, and tries to load a FAISS index from
``INDEX_PATH``.
"""

from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
from langchain_core.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_community.embeddings import OCIGenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnableMap
from langchain_community.document_loaders import UnstructuredPDFLoader, PyMuPDFLoader
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda

from tqdm import tqdm
import os
import pickle
import re
import atexit
import time
import oracledb
import json

# =========================
# Oracle Autonomous Configuration
# =========================
WALLET_PATH = "Wallet_oradb23ai"
DB_ALIAS = "oradb23ai_high"
# Credentials can be supplied through the environment so they do not have to be
# committed with the source; the literals remain as backward-compatible defaults.
USERNAME = os.environ.get("ORACLE_DB_USER", "admin")
PASSWORD = os.environ.get("ORACLE_DB_PASSWORD", "**********")
os.environ["TNS_ADMIN"] = WALLET_PATH
GRAPH_NAME = "GRAPH_DB_1"

oracle_conn = oracledb.connect(
    user=USERNAME,
    password=PASSWORD,
    dsn=DB_ALIAS,
    config_dir=WALLET_PATH,
    wallet_location=WALLET_PATH,
    wallet_password=PASSWORD,
)
atexit.register(oracle_conn.close)


# =========================
# Oracle Graph Client
# =========================
def create_tables_if_not_exist(conn):
    """Create the ENTITIES_/RELATIONS_ tables for ``GRAPH_NAME`` if missing.

    Each CREATE TABLE runs inside a PL/SQL block that ignores ORA-00955
    ("name is already used by an existing object"), which makes the call
    idempotent. Any other failure is printed and swallowed (best effort), as
    in the original implementation.

    Args:
        conn: An open ``oracledb`` connection.
    """
    cursor = conn.cursor()

    try:
        cursor.execute(f"""
        BEGIN
            EXECUTE IMMEDIATE '
                CREATE TABLE ENTITIES_{GRAPH_NAME} (
                    ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                    NAME VARCHAR2(500)
                )
            ';
        EXCEPTION
            WHEN OTHERS THEN
                IF SQLCODE != -955 THEN
                    RAISE;
                END IF;
        END;
        """)
        cursor.execute(f"""
        BEGIN
            EXECUTE IMMEDIATE '
                CREATE TABLE RELATIONS_{GRAPH_NAME} (
                    ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                    SOURCE_ID NUMBER,
                    TARGET_ID NUMBER,
                    RELATION_TYPE VARCHAR2(100),
                    SOURCE_TEXT VARCHAR2(4000)
                )
            ';
        EXCEPTION
            WHEN OTHERS THEN
                IF SQLCODE != -955 THEN
                    RAISE;
                END IF;
        END;
        """)
        conn.commit()
        print("✅ ENTITIES and RELATIONS tables created or already exist.")
    except Exception as e:
        print(f"[ERROR] Failed to create tables: {e}")
    finally:
        cursor.close()


create_tables_if_not_exist(oracle_conn)

# =========================
# Global Configurations
# =========================
INDEX_PATH = "./faiss_index"
PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl")
chapter_separator_regex = r"^(#{1,6} .+|\*\*.+\*\*)$"

# Search settings shared by every retriever built in this module.
_RETRIEVER_SEARCH_TYPE = "similarity"
_RETRIEVER_SEARCH_KWARGS = {"k": 50, "fetch_k": 100}

# =========================
# LLM Definitions
# =========================
# NOTE(review): `llm` produces the final YES/NO/PARTIAL decision but runs with
# temperature 0.7, which sits oddly with the "deterministic output" goal stated
# in the README -- confirm whether a lower temperature is intended.
llm = ChatOCIGenAI(
    model_id="meta.llama-3.1-405b-instruct",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
    auth_profile="DEFAULT",
    model_kwargs={"temperature": 0.7, "top_p": 0.75, "max_tokens": 4000},
)

llm_for_rag = ChatOCIGenAI(
    model_id="meta.llama-3.1-405b-instruct",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
    auth_profile="DEFAULT",
)

embeddings = OCIGenAIEmbeddings(
    model_id="cohere.embed-multilingual-v3.0",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
    auth_profile="DEFAULT",
)


# =========================
# Knowledge graph construction
# =========================
def _ensure_property_graph(cursor):
    """Create the property graph over the entity/relation tables (best effort)."""
    try:
        cursor.execute(f"""
        BEGIN
            EXECUTE IMMEDIATE '
                CREATE PROPERTY GRAPH {GRAPH_NAME}
                VERTEX TABLES (ENTITIES_{GRAPH_NAME}
                    KEY (ID)
                    LABEL ENTITIES
                    PROPERTIES (NAME))
                EDGE TABLES (RELATIONS_{GRAPH_NAME}
                    KEY (ID)
                    SOURCE KEY (SOURCE_ID) REFERENCES ENTITIES_{GRAPH_NAME}(ID)
                    DESTINATION KEY (TARGET_ID) REFERENCES ENTITIES_{GRAPH_NAME}(ID)
                    LABEL RELATIONS
                    PROPERTIES (RELATION_TYPE, SOURCE_TEXT))
            ';
        EXCEPTION
            WHEN OTHERS THEN
                IF SQLCODE != -55358 THEN -- ORA-55358: Graph already exists
                    RAISE;
                END IF;
        END;
        """)
        print(f"🧠 Graph '{GRAPH_NAME}' created or already exists.")
    except Exception as e:
        print(f"[GRAPH ERROR] Failed to create graph: {e}")


def _parse_triple(line):
    """Parse ``<entity> -[RELATION]-> <target>`` into ``(relation, target)``.

    Returns ``None`` when the line does not have exactly one ``-[`` and one
    ``]->`` separator, or when the relation or target is empty. The left-hand
    entity is ignored because every stored fact is anchored on REQUIREMENT.
    """
    parts = line.split("-[")
    if len(parts) != 2:
        return None

    right_part = parts[1].split("]->")
    if len(right_part) != 2:
        return None

    raw_relation, target = right_part
    relation = re.sub(r"\W+", "_", raw_relation.strip().upper())
    target = target.strip()

    # An empty target would be stored as NULL (Oracle treats '' as NULL), which
    # MERGE can never match again and the follow-up SELECT cannot find.
    if not relation or not target:
        return None
    return relation, target


def _get_or_create_entity_id(cursor, name, cache):
    """Return the ID of the entity called ``name``, inserting it when absent."""
    if name in cache:
        return cache[name]

    # The statement repeats the :name placeholder, and positional binding in a
    # SQL statement needs one value per occurrence -- hence [name, name].
    cursor.execute(
        f"MERGE INTO ENTITIES_{GRAPH_NAME} e "
        "USING (SELECT :name AS NAME FROM dual) src ON (e.name = src.name) "
        "WHEN NOT MATCHED THEN INSERT (NAME) VALUES (:name)",
        [name, name],
    )
    cursor.execute(f"SELECT ID FROM ENTITIES_{GRAPH_NAME} WHERE NAME = :name", [name])
    row = cursor.fetchone()
    if row is None:
        raise LookupError(f"Entity not found after MERGE: {name!r}")

    cache[name] = row[0]
    return row[0]


def create_knowledge_graph(chunks):
    """Extract explicit facts from ``chunks`` and store them as graph edges.

    For each non-empty chunk the LLM is asked for triples of the form
    ``REQUIREMENT -[RELATION]-> target``. Each valid triple becomes a row in
    ``RELATIONS_<GRAPH_NAME>`` whose SOURCE_TEXT column holds the chunk's
    ``source`` metadata. Failures for one chunk or triple are printed and do
    not stop the others.

    Args:
        chunks: Iterable of documents exposing ``page_content`` and ``metadata``.
    """
    cursor = oracle_conn.cursor()
    entity_ids = {}

    try:
        # Creates graph if it does not exist
        _ensure_property_graph(cursor)

        # Inserting vertices and edges into the tables
        for doc in chunks:
            text = doc.page_content
            source = doc.metadata.get("source", "unknown")

            if not text.strip():
                continue

            # The four target placeholders were empty in the original prompt
            # (angle-bracket text had been stripped), leaving the model without
            # a description of what follows each arrow.
            prompt = f"""
            You are extracting structured RFP evidence from technical documentation.

            Given the text below, identify ONLY explicit, verifiable facts.

            Text:
            {text}

            Extract triples in ONE of the following formats ONLY:

            1. REQUIREMENT -[HAS_SUBJECT]-> <subject>
            2. REQUIREMENT -[HAS_METRIC]-> <metric name>
            3. REQUIREMENT -[HAS_VALUE]-> <value>
            4. REQUIREMENT -[SUPPORTED_BY]-> <document section>

            Rules:
            - Use REQUIREMENT as the source entity
            - Use UPPERCASE relation names
            - Do NOT infer or assume
            - If nothing explicit is found, return NONE
            """
            try:
                response = llm_for_rag.invoke(prompt)
                result = response.content.strip()
            except Exception as e:
                print(f"[ERROR] Gen AI call error: {e}")
                continue

            if result.upper() == "NONE":
                continue

            for line in result.splitlines():
                parsed = _parse_triple(line)
                if parsed is None:
                    continue
                relation, entity2 = parsed
                # Every fact is anchored on the REQUIREMENT vertex, whatever
                # the model wrote on the left-hand side.
                entity1 = "REQUIREMENT"

                try:
                    source_id = _get_or_create_entity_id(cursor, entity1, entity_ids)
                    target_id = _get_or_create_entity_id(cursor, entity2, entity_ids)
                    cursor.execute(f"""
                        INSERT INTO RELATIONS_{GRAPH_NAME} (SOURCE_ID, TARGET_ID, RELATION_TYPE, SOURCE_TEXT)
                        VALUES (:src, :tgt, :rel, :txt)
                    """, [source_id, target_id, relation, source])
                    print(f"✅ {entity1} -[{relation}]-> {entity2}")
                except Exception as e:
                    print(f"[INSERT ERROR] {e}")

        oracle_conn.commit()
    finally:
        # Release the cursor even when an unexpected error escapes the loop.
        cursor.close()

    print("💾 Knowledge graph updated.")


# =========================
# Requirement parsing
# =========================
def parse_rfp_requirement(question: str) -> dict:
    """Turn a free-text RFP question into a structured requirement dict.

    The LLM is asked for a JSON object wrapped in ``<json>`` tags. If no JSON
    object can be extracted, or it is not an object, a heuristic fallback is
    returned: the question as subject and its first five words as keywords.

    Args:
        question: The RFP question as typed by the user.

    Returns:
        A dict that normally carries ``requirement_type``, ``subject``,
        ``expected_value``, ``decision_type`` and ``keywords``. A successful
        LLM answer is returned as parsed, so keys may be missing.
    """
    prompt = f"""
You are an RFP requirement extractor.

Return the result STRICTLY between the tags <json> and </json>.
Do NOT write anything outside these tags.

Question:
{question}

<json>
{{
  "requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL",
  "subject": "",
  "expected_value": "",
  "decision_type": "YES_NO | YES_NO_PARTIAL",
  "keywords": ["keyword1", "keyword2"]
}}
</json>
"""

    resp = llm_for_rag.invoke(prompt)
    raw = resp.content.strip()

    try:
        # Drop Markdown code fences (```json ... ``` or ``` ... ```).
        raw = re.sub(r"```json|```", "", raw).strip()

        # Prefer the tagged block; fall back to the first JSON object so an
        # answer that forgot the tags is still usable.
        match = (
            re.search(r"<json>\s*(\{.*?\})\s*</json>", raw, re.DOTALL)
            or re.search(r"(\{.*?\})", raw, re.DOTALL)
        )
        if not match:
            raise ValueError("No JSON block found")

        parsed = json.loads(match.group(1))
        if not isinstance(parsed, dict):
            raise ValueError("JSON block is not an object")
        return parsed

    except Exception as e:
        print("⚠️ RFP PARSER FAILED")
        print(f"REASON: {e}")
        print("RAW RESPONSE:")
        print(raw)

        return {
            "requirement_type": "UNKNOWN",
            "subject": question,
            "expected_value": "",
            "decision_type": "YES_NO_PARTIAL",
            "keywords": re.findall(r"\b\w+\b", question.lower())[:5],
        }


def extract_graph_keywords_from_requirement(req: dict) -> str:
    """Build a sorted, comma-separated keyword string from a requirement.

    Combines ``keywords`` with the lower-cased ``subject`` and
    ``expected_value``. Tolerates a missing or null ``keywords`` entry, a
    single string instead of a list, and non-string values.
    """
    raw_keywords = req.get("keywords") or []
    if isinstance(raw_keywords, str):
        raw_keywords = [raw_keywords]

    keywords = {str(item).strip() for item in raw_keywords if str(item).strip()}
    if req.get("subject"):
        keywords.add(str(req["subject"]).lower())
    if req.get("expected_value"):
        keywords.add(str(req["expected_value"]).lower())
    return ", ".join(sorted(keywords))


# Words dropped from Oracle Text queries: common English stopwords plus Oracle
# Text reserved words, which CONTAINS would otherwise parse as operators.
_ORACLE_TEXT_STOPWORDS = frozenset({
    "and", "or", "the", "with", "between", "of", "to", "for",
    "in", "on", "by", "is", "are", "was", "were", "be",
    "about", "accum", "equiv", "fuzzy", "haspath", "inpath", "mdata",
    "minus", "near", "not", "trsyn", "within",
})


def build_oracle_text_query(text: str) -> str | None:
    """Convert free text into an ``OR`` query for Oracle Text ``CONTAINS``.

    The text is lower-cased, everything outside ``[a-z0-9]`` becomes
    whitespace, and tokens shorter than four characters or listed in
    ``_ORACLE_TEXT_STOPWORDS`` are dropped.

    Returns:
        The distinct tokens, sorted and joined with ``" OR "``, or ``None``
        when no token survives.
    """
    cleaned = re.sub(r"[^a-z0-9\s]", " ", text.lower())
    tokens = sorted({
        token
        for token in cleaned.split()
        if len(token) >= 4 and token not in _ORACLE_TEXT_STOPWORDS
    })
    return " OR ".join(tokens) if tokens else None


def query_knowledge_graph(raw_keywords: str):
    """Return REQUIREMENT facts from the graph tables matching the keywords.

    Args:
        raw_keywords: Free text, typically the output of
            ``extract_graph_keywords_from_requirement``.

    Returns:
        A list of ``(source_name, relation_type, target_name)`` rows. When no
        usable keyword remains, every REQUIREMENT fact is returned. A database
        error is printed and yields an empty list, so the decision chain can
        continue on document evidence alone.

    NOTE(review): ``CONTAINS`` needs Oracle Text indexes on
    ``ENTITIES_<GRAPH_NAME>.NAME`` and ``RELATIONS_<GRAPH_NAME>.RELATION_TYPE``;
    nothing in this file creates them -- confirm they are provisioned elsewhere.
    """
    safe_query = build_oracle_text_query(raw_keywords)

    base_sql = f"""
        SELECT
            e1.NAME AS source_name,
            r.RELATION_TYPE,
            e2.NAME AS target_name
        FROM RELATIONS_{GRAPH_NAME} r
        JOIN ENTITIES_{GRAPH_NAME} e1 ON e1.ID = r.SOURCE_ID
        JOIN ENTITIES_{GRAPH_NAME} e2 ON e2.ID = r.TARGET_ID
        WHERE e1.NAME = 'REQUIREMENT'
    """

    binds = {}
    if safe_query:
        # The text query is bound rather than interpolated into the SQL.
        base_sql += """
        AND (
            CONTAINS(e2.NAME, :text_query) > 0
            OR CONTAINS(r.RELATION_TYPE, :text_query) > 0
        )
        """
        binds["text_query"] = safe_query

    print("🔎 GRAPH QUERY:")
    print(base_sql)
    if binds:
        print(f"   :text_query = {safe_query}")

    cursor = oracle_conn.cursor()
    try:
        cursor.execute(base_sql, binds)
        rows = cursor.fetchall()
    except oracledb.DatabaseError as e:
        print(f"[GRAPH QUERY ERROR] {e}")
        rows = []
    finally:
        cursor.close()

    print("📊 GRAPH FACTS:")
    for s, r, t in rows:
        print(f"   REQUIREMENT -[{r}]-> {t}")

    return rows


# =========================
# RE-RANK
# =========================
# Matches "<entity> -[RELATION]-> <entity>" in a textual graph context. The
# pattern in the original source was corrupted into "$begin:math:display$..."
# and could never match.
_TRIPLE_TEXT_PATTERN = re.compile(r"([\w\s]+)-\[[\w_]+\]->([\w\s]+)")


def extract_terms_from_graph_text(graph_context):
    """Collect lower-cased terms from graph facts for re-ranking.

    Args:
        graph_context: Either a list of rows (every string column becomes a
            term) or a string containing ``a -[REL]-> b`` triples (both
            endpoints become terms).

    Returns:
        A set of terms; empty for falsy or unsupported input.
    """
    if not graph_context:
        return set()

    if isinstance(graph_context, list):
        return {
            col.lower()
            for row in graph_context
            for col in row
            if isinstance(col, str)
        }

    if isinstance(graph_context, str):
        terms = set()
        for e1, e2 in _TRIPLE_TEXT_PATTERN.findall(graph_context):
            terms.add(e1.strip().lower())
            terms.add(e2.strip().lower())
        return terms

    return set()


def rerank_documents_with_graph_terms(docs, query, graph_terms, top_k=5):
    """Re-rank retrieved documents by how many query/graph terms they contain.

    A document scores one point per distinct term found as a substring of its
    lower-cased content. Ties keep the retriever's order because the sort is
    stable.

    Args:
        docs: Documents exposing ``page_content``.
        query: Query text; its word tokens become terms.
        graph_terms: Extra terms taken from the knowledge graph.
        top_k: Number of documents to keep (default 5, the original value).

    Returns:
        The ``page_content`` strings of the best ``top_k`` documents.
    """
    query_terms = set(re.findall(r"\b\w+\b", query.lower()))
    all_terms = query_terms.union(graph_terms)

    scored_docs = []
    for doc in docs:
        doc_text = doc.page_content.lower()
        score = sum(1 for term in all_terms if term in doc_text)
        scored_docs.append((score, doc))

    top_docs = sorted(scored_docs, key=lambda pair: pair[0], reverse=True)[:top_k]
    return [doc.page_content for _, doc in top_docs]


# =========================
# SEMANTIC CHUNKING
# =========================
def split_llm_output_into_chapters(llm_text):
    """Split LLM output into chapters at Markdown headings or bold-only lines.

    A line matching ``chapter_separator_regex`` starts a new chapter, and text
    before the first heading forms its own chapter. Returns the stripped
    chapter strings.
    """
    chapters = []
    current_chapter = []

    for line in llm_text.splitlines():
        if re.match(chapter_separator_regex, line):
            if current_chapter:
                chapters.append("\n".join(current_chapter).strip())
            current_chapter = [line]
        else:
            current_chapter.append(line)

    if current_chapter:
        chapters.append("\n".join(current_chapter).strip())

    return chapters


def semantic_chunking(text, max_retries=None, retry_delay=2.0):
    """Ask the LLM to restructure OCR text into headed, chunkable sections.

    The call is retried on failure. Only ``Exception`` is caught, so Ctrl+C
    can interrupt the loop, and each error is printed.

    Args:
        text: Raw text extracted from a PDF.
        max_retries: Maximum number of attempts; ``None`` (default) retries
            until the call succeeds, as the original did.
        retry_delay: Base delay in seconds between attempts; it grows
            linearly and is capped at 30 seconds.

    Returns:
        The LLM response object, or ``None`` when ``max_retries`` is exhausted.
    """
    prompt = f"""
    You received the following text extracted via OCR:

    {text}

    Your task:
    1. Identify headings (short uppercase or bold lines, no period at the end) putting the Product Name (Application Name) and the Subject
    2. Separate paragraphs by heading
    3. Indicate columns with [COLUMN 1], [COLUMN 2] if present
    4. Indicate tables with [TABLE] in markdown format
    5. Indicate explicity metrics (if it exists)
       Examples:
       - Oracle Financial Services RTO is 1 hour
       - The Oracle Banking Supply Chain Finance Cloud Service A maximum number of 10K Hosted Transactions
       - The Oracle Banking Payments Cloud Service, Additional Non-Production Environment: You may purchase up to a maximum of ten (10) additional Non-Production Environments
    """

    attempt = 0
    while True:
        attempt += 1
        try:
            return llm_for_rag.invoke(prompt)
        except Exception as e:
            print(f"[ERROR] Gen AI call error: {e}")
            if max_retries is not None and attempt >= max_retries:
                return None
            time.sleep(min(retry_delay * attempt, 30.0))


def read_pdfs(pdf_path):
    """Load a PDF and return its pages joined into one string.

    Paths containing ``-ocr`` are read with ``PyMuPDFLoader``; every other
    path is read with ``UnstructuredPDFLoader``.
    """
    if "-ocr" in pdf_path:
        doc_pages = PyMuPDFLoader(str(pdf_path)).load()
    else:
        doc_pages = UnstructuredPDFLoader(str(pdf_path)).load()
    return "\n".join(page.page_content for page in doc_pages)


def smart_split_text(text, max_chunk_size=10_000):
    """Split ``text`` into chunks of at most ``max_chunk_size`` characters.

    Each chunk ends just after the last ``.``, ``!``, ``?`` or blank line
    inside the window when one exists, otherwise at the hard size limit.
    Chunks are stripped and empty ones are skipped.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive; the loop could
            never advance in that case.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be a positive integer")

    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = min(start + max_chunk_size, text_length)
        split_point = max(
            text.rfind(".", start, end),
            text.rfind("!", start, end),
            text.rfind("?", start, end),
            text.rfind("\n\n", start, end),
        )
        if split_point == -1 or split_point <= start:
            split_point = end
        else:
            split_point += 1  # keep the terminator with the chunk it closes

        chunk = text[start:split_point].strip()
        if chunk:
            chunks.append(chunk)

        start = split_point

    return chunks


def load_previously_indexed_docs():
    """Return the set of already indexed PDF paths (empty if none is stored)."""
    if os.path.exists(PROCESSED_DOCS_FILE):
        # NOTE: pickle must only be used on files this application wrote
        # itself; never point PROCESSED_DOCS_FILE at untrusted data.
        with open(PROCESSED_DOCS_FILE, "rb") as f:
            return pickle.load(f)
    return set()


def save_indexed_docs(docs):
    """Persist the set of indexed PDF paths, creating the directory if needed."""
    os.makedirs(os.path.dirname(PROCESSED_DOCS_FILE) or ".", exist_ok=True)
    with open(PROCESSED_DOCS_FILE, "wb") as f:
        pickle.dump(docs, f)


# =========================
# Retrieval context and decision chain
# =========================
RFP_DECISION_TEMPLATE = """
You are answering an RFP.

Requirement:
Type: {requirement_type}
Subject: {subject}
Expected value: {expected_value}

Document evidence:
{text_context}

Graph evidence:
{graph_context}

Rules:
- Answer ONLY with YES, NO or PARTIAL
- Do NOT assume anything not explicitly stated
- If value differs, answer PARTIAL
- If not found, answer NO

Return ONLY valid JSON:
{{
  "answer": "YES | NO | PARTIAL",
  "justification": "",
  "evidence": [
    {{
      "quote": "",
      "source": ""
    }}
  ]
}}
"""
prompt = PromptTemplate.from_template(RFP_DECISION_TEMPLATE)

# Module-level FAISS handles; both stay None when no index exists yet, so
# later use fails with a clear message instead of a NameError.
vectorstore = None
retriever = None


def _resolve_retriever(active_retriever=None):
    """Return ``active_retriever`` or the module-level one; fail clearly if none."""
    chosen = active_retriever if active_retriever is not None else retriever
    if chosen is None:
        raise RuntimeError(
            "No FAISS retriever available: index documents first (run chat())."
        )
    return chosen


def get_context(x, active_retriever=None):
    """Return re-ranked document text for a raw question.

    Args:
        x: The question string, or a dict carrying it under ``"input"``.
        active_retriever: Optional retriever overriding the module-level one.

    Returns:
        The top re-ranked chunks joined by blank lines.
    """
    query = x.get("input") if isinstance(x, dict) else x

    # 1. Retrieve vector chunks as usual
    docs = _resolve_retriever(active_retriever).invoke(query)

    req = parse_rfp_requirement(query)
    query_terms = extract_graph_keywords_from_requirement(req)
    graph_context = query_knowledge_graph(query_terms)

    graph_terms = extract_terms_from_graph_text(graph_context)
    reranked_chunks = rerank_documents_with_graph_terms(docs, query, graph_terms)

    return "\n\n".join(reranked_chunks)


def get_context_from_requirement(req: dict, active_retriever=None):
    """Build every prompt variable for the decision step from a requirement.

    Retrieves documents and graph facts for the requirement's keywords and
    re-ranks the documents with the graph terms.

    Args:
        req: Requirement dict as produced by ``parse_rfp_requirement``.
        active_retriever: Optional retriever overriding the module-level one.

    Returns:
        A dict with ``text_context``, ``graph_context``, ``requirement_type``,
        ``subject`` and ``expected_value``. Missing requirement keys fall back
        to ``"UNKNOWN"`` or ``""`` instead of raising ``KeyError``.
    """
    query_terms = extract_graph_keywords_from_requirement(req)

    docs = _resolve_retriever(active_retriever).invoke(query_terms)
    graph_context = query_knowledge_graph(query_terms)

    graph_terms = extract_terms_from_graph_text(graph_context)
    reranked_chunks = rerank_documents_with_graph_terms(
        docs,
        query_terms,
        graph_terms,
    )

    return {
        "text_context": "\n\n".join(reranked_chunks),
        "graph_context": graph_context,
        "requirement_type": req.get("requirement_type", "UNKNOWN"),
        "subject": req.get("subject", ""),
        "expected_value": req.get("expected_value", ""),
    }


def _build_rfp_chain(active_retriever=None):
    """Assemble the parse -> context -> prompt -> LLM -> text pipeline.

    The context step already returns every variable the prompt needs, so it
    runs once per question. The original chain called it separately for
    ``text_context`` and ``graph_context``, doubling retrieval and graph work.
    """
    return (
        RunnableLambda(parse_rfp_requirement)
        | RunnableLambda(
            lambda req: get_context_from_requirement(req, active_retriever)
        )
        | prompt
        | llm
        | StrOutputParser()
    )


# =========================
# Main Function
# =========================
def _index_pdf(pdf_path):
    """Read one PDF and return its LLM-structured chapters as documents."""
    full_text = read_pdfs(pdf_path=pdf_path)
    text_chunks = smart_split_text(full_text, max_chunk_size=10_000)

    documents = []
    overflow_buffer = ""

    for chunk in tqdm(text_chunks, desc="📄 Processing text chunks", dynamic_ncols=True, leave=False):
        current_text = f"{overflow_buffer}\n\n{chunk}" if overflow_buffer else chunk

        treated_text = semantic_chunking(current_text)
        if not hasattr(treated_text, "content"):
            print(f"[ERROR] semantic_chunking returned unexpected type: {type(treated_text)}")
            continue

        chapters = split_llm_output_into_chapters(treated_text.content)
        last_chapter = chapters[-1] if chapters else ""

        if last_chapter and not last_chapter.strip().endswith((".", "!", "?")):
            print("📌 Last chapter seems incomplete, saving for the next cycle")
            overflow_buffer = last_chapter
            chapters = chapters[:-1]
        else:
            overflow_buffer = ""

        for chapter_text in chapters:
            if not chapter_text:
                continue
            documents.append(Document(page_content=chapter_text, metadata={"source": pdf_path}))
            print(f"✅ New chapter indexed:\n{chapter_text}...\n")

    # There is no "next cycle" after the last chunk: index what is still held
    # back instead of dropping it.
    if overflow_buffer.strip():
        documents.append(Document(page_content=overflow_buffer, metadata={"source": pdf_path}))
        print(f"✅ New chapter indexed:\n{overflow_buffer}...\n")

    return documents


def chat(pdf_paths=None):
    """Index new PDFs, update the knowledge graph, then answer questions.

    Args:
        pdf_paths: PDF files to ingest; defaults to the sample document the
            original script hard-coded.
    """
    if pdf_paths is None:
        pdf_paths = ['FSGIU+OBCS+SD+121125+FINAL.pdf']

    already_indexed_docs = load_previously_indexed_docs()
    updated_docs = set()

    try:
        # allow_dangerous_deserialization loads pickled data: only use an index
        # directory this application created itself.
        local_store = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
        print("✔️ FAISS index loaded.")
    except Exception:
        print("⚠️ FAISS index not found, creating a new one.")
        local_store = None

    new_chunks = []

    for pdf_path in tqdm(pdf_paths, desc="📄 Processing PDFs"):
        print(f" {os.path.basename(pdf_path)}")
        if pdf_path in already_indexed_docs:
            print(f"✅ Document already indexed: {pdf_path}")
            continue

        new_chunks.extend(_index_pdf(pdf_path))
        updated_docs.add(str(pdf_path))

    if new_chunks:
        if local_store:
            local_store.add_documents(new_chunks)
        else:
            local_store = FAISS.from_documents(new_chunks, embedding=embeddings)

        local_store.save_local(INDEX_PATH)
        save_indexed_docs(already_indexed_docs.union(updated_docs))
        print(f"💾 {len(new_chunks)} chunks added to FAISS index.")

        print("🧠 Building knowledge graph...")
        create_knowledge_graph(new_chunks)
    else:
        print("📁 No new documents to index.")

    if local_store is None:
        # Nothing was loaded and nothing was indexed, so there is no evidence.
        print("[ERROR] No FAISS index available; nothing to query.")
        return

    active_retriever = local_store.as_retriever(
        search_type=_RETRIEVER_SEARCH_TYPE,
        search_kwargs=dict(_RETRIEVER_SEARCH_KWARGS),
    )
    interactive_chain = _build_rfp_chain(active_retriever)

    print("✅ READY")

    while True:
        query = input("❓ Question (or 'quit' to exit): ").strip()
        if query.lower() == "quit":
            break
        if not query:
            continue
        response = interactive_chain.invoke(query)
        print("\n📜 RESPONSE:\n")
        print(response)
        print("\n" + "=" * 80 + "\n")


# =========================
# Module-level retriever and chain (used by answer_question)
# =========================
try:
    vectorstore = FAISS.load_local(
        INDEX_PATH,
        embeddings,
        allow_dangerous_deserialization=True,
    )

    retriever = vectorstore.as_retriever(
        search_type=_RETRIEVER_SEARCH_TYPE,
        search_kwargs=dict(_RETRIEVER_SEARCH_KWARGS),
    )
except Exception:
    # Expected on first run, before any document has been indexed.
    print("No Faiss")

parse_requirement_runnable = RunnableLambda(parse_rfp_requirement)

chain = _build_rfp_chain()


def answer_question(question: str) -> str:
    """Answer one RFP question with the module-level decision chain."""
    return chain.invoke(question)


# 🚀 Run
if __name__ == "__main__":
    chat()