"""RFP answering pipeline: PDF ingestion -> semantic chunking (LLM) -> FAISS
vector index + Oracle 23ai property-graph of extracted triples -> a LangChain
chain that answers RFP requirements as strict-JSON YES/NO/PARTIAL decisions.
"""
from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
from langchain_core.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_community.embeddings import OCIGenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnableMap
from langchain_community.document_loaders import UnstructuredPDFLoader, PyMuPDFLoader
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda
from tqdm import tqdm
import os
import pickle
import re
import atexit
import oracledb
import json

# =========================
# Oracle Autonomous Configuration
# =========================
WALLET_PATH = "Wallet_oradb23ai"
DB_ALIAS = "oradb23ai_high"
USERNAME = "admin"
PASSWORD = "**********"
os.environ["TNS_ADMIN"] = WALLET_PATH
GRAPH_NAME = "GRAPH_DB_1"

# =========================
# Global Configurations
# =========================
INDEX_PATH = "./faiss_index"
PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl")
# A chapter starts on a markdown heading (# .. ######) or a fully-bold line.
chapter_separator_regex = r"^(#{1,6} .+|\*\*.+\*\*)$"
pdf_paths = ['.pdf']

# =========================
# LLM Definitions
# =========================
# Chat model used for the final RFP decision (higher temperature).
llm = ChatOCIGenAI(
    model_id="meta.llama-3.1-405b-instruct",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
    auth_profile="DEFAULT",
    model_kwargs={"temperature": 0.7, "top_p": 0.75, "max_tokens": 4000},
)

# Model used for extraction / chunking tasks (provider defaults).
llm_for_rag = ChatOCIGenAI(
    model_id="meta.llama-3.1-405b-instruct",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
    auth_profile="DEFAULT",
)

embeddings = OCIGenAIEmbeddings(
    model_id="cohere.embed-multilingual-v3.0",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
    auth_profile="DEFAULT",
)

# Single process-wide connection; closed automatically at interpreter exit.
oracle_conn = oracledb.connect(
    user=USERNAME,
    password=PASSWORD,
    dsn=DB_ALIAS,
    config_dir=WALLET_PATH,
    wallet_location=WALLET_PATH,
    wallet_password=PASSWORD,
)
atexit.register(lambda: oracle_conn.close())


# =========================
# Oracle Graph Client
# =========================
def ensure_oracle_text_index(conn, table_name: str, column_name: str, index_name: str):
    """
    Ensure an Oracle Text (CTXSYS.CONTEXT) index exists and is synchronized
    for a given table and column.

    Identifiers are interpolated into DDL (binds are not allowed there); all
    call sites pass module-controlled names, never user input.
    """
    cursor = conn.cursor()
    try:
        # 1. Check whether the index already exists.
        cursor.execute(
            """
            SELECT COUNT(*)
            FROM user_indexes
            WHERE index_name = :idx_name
            """,
            {"idx_name": index_name.upper()},
        )
        exists = cursor.fetchone()[0] > 0

        if not exists:
            print(f"πŸ› οΈ Creating Oracle Text index {index_name} on {table_name}.{column_name}")
            cursor.execute(f"""
                CREATE INDEX {index_name}
                ON {table_name} ({column_name})
                INDEXTYPE IS CTXSYS.CONTEXT
            """)
        else:
            print(f"βœ”οΈ Oracle Text index already exists: {index_name}")

        # 2. Synchronize the index (important when rows already exist,
        #    otherwise CONTAINS() will not see them).
        print(f"πŸ”„ Syncing Oracle Text index: {index_name}")
        cursor.execute(f"""
            BEGIN
                CTX_DDL.SYNC_INDEX('{index_name}');
            END;
        """)
        conn.commit()
    finally:
        # Always release the cursor, even if the DDL/sync fails.
        cursor.close()


def create_tables_if_not_exist(conn):
    """Create the ENTITIES_* and RELATIONS_* tables for the graph, ignoring
    ORA-00955 (name already used) so the call is idempotent."""
    cursor = conn.cursor()
    try:
        cursor.execute(f"""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE TABLE ENTITIES_{GRAPH_NAME} (
                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                        NAME VARCHAR2(500)
                    )
                ';
            EXCEPTION
                WHEN OTHERS THEN
                    IF SQLCODE != -955 THEN
                        RAISE;
                    END IF;
            END;
        """)
        cursor.execute(f"""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE TABLE RELATIONS_{GRAPH_NAME} (
                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                        SOURCE_ID NUMBER,
                        TARGET_ID NUMBER,
                        RELATION_TYPE VARCHAR2(100),
                        SOURCE_TEXT VARCHAR2(4000)
                    )
                ';
            EXCEPTION
                WHEN OTHERS THEN
                    IF SQLCODE != -955 THEN
                        RAISE;
                    END IF;
            END;
        """)
        conn.commit()
        print("βœ… ENTITIES and RELATIONS tables created or already exist.")
    except Exception as e:
        print(f"[ERROR] Failed to create tables: {e}")
    finally:
        cursor.close()


# Import-time bootstrap: tables + Oracle Text indexes must exist before any
# graph query runs.
create_tables_if_not_exist(oracle_conn)
ensure_oracle_text_index(
    oracle_conn,
    "ENTITIES_" + GRAPH_NAME,
    "NAME",
    "IDX_ENT_" + GRAPH_NAME + "_NAME",
)
ensure_oracle_text_index(
    oracle_conn,
    "RELATIONS_" + GRAPH_NAME,
    "RELATION_TYPE",
    "IDX_REL_" + GRAPH_NAME + "_RELTYPE",
)


def create_knowledge_graph(chunks):
    """
    Extract REQUIREMENT-centric triples from each chunk with the LLM and
    persist them as vertices (ENTITIES_*) and edges (RELATIONS_*), creating
    the property graph itself on first use.

    :param chunks: list of langchain Documents (page_content + metadata["source"]).
    """
    cursor = oracle_conn.cursor()

    # Create the property graph if it does not exist (ORA-55358 = exists).
    try:
        cursor.execute(f"""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE PROPERTY GRAPH {GRAPH_NAME}
                    VERTEX TABLES (ENTITIES_{GRAPH_NAME} KEY (ID) LABEL ENTITIES PROPERTIES (NAME))
                    EDGE TABLES (RELATIONS_{GRAPH_NAME} KEY (ID)
                        SOURCE KEY (SOURCE_ID) REFERENCES ENTITIES_{GRAPH_NAME}(ID)
                        DESTINATION KEY (TARGET_ID) REFERENCES ENTITIES_{GRAPH_NAME}(ID)
                        LABEL RELATIONS PROPERTIES (RELATION_TYPE, SOURCE_TEXT))
                ';
            EXCEPTION
                WHEN OTHERS THEN
                    IF SQLCODE != -55358 THEN -- ORA-55358: Graph already exists
                        RAISE;
                    END IF;
            END;
        """)
        print(f"🧠 Graph '{GRAPH_NAME}' created or already exists.")
    except Exception as e:
        print(f"[GRAPH ERROR] Failed to create graph: {e}")

    # Insert vertices and edges extracted from every chunk.
    for doc in chunks:
        text = doc.page_content
        source = doc.metadata.get("source", "unknown")
        if not text.strip():
            continue

        # NOTE(review): the object placeholders after each "->" appear to have
        # been lost (probably angle-bracket tokens stripped by an earlier
        # copy/paste) — confirm the intended prompt wording.
        prompt = f"""
        You are extracting structured RFP evidence from technical documentation.
        Given the text below, identify ONLY explicit, verifiable facts.

        Text:
        {text}

        Extract triples in ONE of the following formats ONLY:
        1. REQUIREMENT -[HAS_SUBJECT]->
        2. REQUIREMENT -[HAS_METRIC]->
        3. REQUIREMENT -[HAS_VALUE]->
        4. REQUIREMENT -[SUPPORTED_BY]->

        Rules:
        - Use REQUIREMENT as the source entity
        - Use UPPERCASE relation names
        - Do NOT infer or assume
        - If nothing explicit is found, return NONE
        """

        try:
            response = llm_for_rag.invoke(prompt)
            result = response.content.strip()
        except Exception as e:
            print(f"[ERROR] Gen AI call error: {e}")
            continue

        if result.upper() == "NONE":
            continue

        # Each line is expected to look like: ENTITY -[RELATION]-> ENTITY
        for triple in result.splitlines():
            parts = triple.split("-[")
            if len(parts) != 2:
                continue
            right_part = parts[1].split("]->")
            if len(right_part) != 2:
                continue
            raw_relation, entity2 = right_part
            # Normalize the relation to a SQL-safe UPPER_SNAKE identifier.
            relation = re.sub(r'\W+', '_', raw_relation.strip().upper())
            entity1 = parts[0].strip()
            entity2 = entity2.strip()
            # The model must anchor every triple on REQUIREMENT; coerce if not.
            if entity1.upper() != "REQUIREMENT":
                entity1 = "REQUIREMENT"

            try:
                # Upsert both entities (MERGE keeps NAME unique).
                cursor.execute(
                    f"MERGE INTO ENTITIES_{GRAPH_NAME} e USING (SELECT :name AS NAME FROM dual) src ON (e.name = src.name) WHEN NOT MATCHED THEN INSERT (NAME) VALUES (:name)",
                    [entity1, entity1],
                )
                cursor.execute(
                    f"MERGE INTO ENTITIES_{GRAPH_NAME} e USING (SELECT :name AS NAME FROM dual) src ON (e.name = src.name) WHEN NOT MATCHED THEN INSERT (NAME) VALUES (:name)",
                    [entity2, entity2],
                )

                # Resolve the generated IDs.
                cursor.execute(f"SELECT ID FROM ENTITIES_{GRAPH_NAME} WHERE NAME = :name", [entity1])
                source_id = cursor.fetchone()[0]
                cursor.execute(f"SELECT ID FROM ENTITIES_{GRAPH_NAME} WHERE NAME = :name", [entity2])
                target_id = cursor.fetchone()[0]

                # Create the edge; SOURCE_TEXT records the originating file.
                cursor.execute(
                    f"""
                    INSERT INTO RELATIONS_{GRAPH_NAME} (SOURCE_ID, TARGET_ID, RELATION_TYPE, SOURCE_TEXT)
                    VALUES (:src, :tgt, :rel, :txt)
                    """,
                    [source_id, target_id, relation, source],
                )
                print(f"βœ… {entity1} -[{relation}]-> {entity2}")
            except Exception as e:
                # Best-effort: a single bad triple must not abort the batch.
                print(f"[INSERT ERROR] {e}")

    oracle_conn.commit()
    cursor.close()
    print("πŸ’Ύ Knowledge graph updated.")


def parse_rfp_requirement(question: str) -> dict:
    """
    Ask the LLM to turn a free-text question into a structured RFP
    requirement dict; fall back to a keyword-based stub on any failure.

    NOTE(review): the prompt says "between the tags and ." — the tag names
    look stripped by an earlier copy/paste; the parser only needs the JSON
    braces, so behavior is unaffected. Confirm intended wording.
    """
    prompt = f"""
    You are an RFP requirement extractor.
    Return the result STRICTLY between the tags and .
    Do NOT write anything outside these tags.

    Question:
    {question}

    {{
        "requirement_type": "COMPLIANCE | FUNCTIONAL | NON_FUNCTIONAL",
        "subject": "",
        "expected_value": "",
        "decision_type": "YES_NO | YES_NO_PARTIAL",
        "keywords": ["keyword1", "keyword2"]
    }}
    """
    resp = llm_for_rag.invoke(prompt)
    raw = resp.content.strip()
    try:
        # Strip ```json ... ``` or ``` ... ``` fences before locating the JSON.
        raw = re.sub(r"```json|```", "", raw).strip()
        # Non-greedy is safe here: the expected JSON has no nested objects.
        match = re.search(r"\s*(\{.*?\})\s*", raw, re.DOTALL)
        if not match:
            raise ValueError("No JSON block found")
        return json.loads(match.group(1))
    except Exception:
        print("⚠️ RFP PARSER FAILED")
        print("RAW RESPONSE:")
        print(raw)
        # Degraded-mode requirement built from the question itself.
        return {
            "requirement_type": "UNKNOWN",
            "subject": question,
            "expected_value": "",
            "decision_type": "YES_NO_PARTIAL",
            "keywords": re.findall(r"\b\w+\b", question.lower())[:5],
        }


def extract_graph_keywords_from_requirement(req: dict) -> str:
    """Flatten a requirement dict into a comma-separated, sorted, deduplicated
    keyword string used both for retrieval and for the graph text query."""
    keywords = set(req.get("keywords", []))
    if req.get("subject"):
        keywords.add(req["subject"].lower())
    if req.get("expected_value"):
        keywords.add(str(req["expected_value"]).lower())
    return ", ".join(sorted(keywords))


def build_oracle_text_query(text: str) -> str | None:
    """
    Turn free text into an Oracle Text CONTAINS() expression:
    lowercase, strip punctuation, drop stopwords and tokens shorter than 4
    chars, OR the unique remainder. Returns None when nothing survives.
    """
    ORACLE_TEXT_STOPWORDS = {
        "and", "or", "the", "with", "between", "of", "to",
        "for", "in", "on", "by", "is", "are", "was", "were", "be",
    }
    text = re.sub(r"[^a-z0-9\s]", " ", text.lower())
    tokens = {
        token
        for token in text.split()
        if len(token) >= 4 and token not in ORACLE_TEXT_STOPWORDS
    }
    return " OR ".join(sorted(tokens)) if tokens else None


def query_knowledge_graph(raw_keywords: str):
    """
    Return (source_name, relation_type, target_name) rows for REQUIREMENT
    edges whose target name or relation type matches the keywords via
    Oracle Text. With no usable keywords, all REQUIREMENT edges are returned.
    """
    cursor = oracle_conn.cursor()
    safe_query = build_oracle_text_query(raw_keywords)

    base_sql = f"""
        SELECT e1.NAME AS source_name, r.RELATION_TYPE, e2.NAME AS target_name
        FROM RELATIONS_{GRAPH_NAME} r
        JOIN ENTITIES_{GRAPH_NAME} e1 ON e1.ID = r.SOURCE_ID
        JOIN ENTITIES_{GRAPH_NAME} e2 ON e2.ID = r.TARGET_ID
        WHERE e1.NAME = 'REQUIREMENT'
    """
    binds = {}
    if safe_query:
        # Bind the CONTAINS text instead of interpolating it into the SQL —
        # tokens are sanitized upstream, but binds remove the injection risk
        # and let the statement cache work.
        base_sql += """
            AND (
                CONTAINS(e2.NAME, :q) > 0
                OR CONTAINS(r.RELATION_TYPE, :q) > 0
            )
        """
        binds = {"q": safe_query}

    print("πŸ”Ž GRAPH QUERY:")
    print(base_sql)
    cursor.execute(base_sql, binds)
    rows = cursor.fetchall()
    cursor.close()

    print("πŸ“Š GRAPH FACTS:")
    for s, r, t in rows:
        print(f" REQUIREMENT -[{r}]-> {t}")
    return rows


# RE-RANK
def extract_terms_from_graph_text(graph_context):
    """
    Collect a lowercase term set from graph evidence.

    Accepts either the row-tuple list returned by query_knowledge_graph or a
    rendered "E1 -[REL]-> E2" string; anything else yields an empty set.
    """
    if not graph_context:
        return set()

    if isinstance(graph_context, list):
        return {
            col.lower()
            for row in graph_context
            for col in row
            if isinstance(col, str)
        }

    if isinstance(graph_context, str):
        terms = set()
        # Matches "ENTITY -[RELATION]-> ENTITY"; the relation group is
        # non-capturing so each hit unpacks to exactly (e1, e2).
        # (The previous pattern was corrupted by a copy/paste artifact.)
        for e1, e2 in re.findall(r"([\w\s]+)-\[(?:\w+)\]->([\w\s]+)", graph_context):
            terms.add(e1.strip().lower())
            terms.add(e2.strip().lower())
        return terms

    return set()


def rerank_documents_with_graph_terms(docs, query, graph_terms):
    """Score each document by how many query/graph terms it contains and
    return the page_content of the top 5."""
    query_terms = set(re.findall(r'\b\w+\b', query.lower()))
    all_terms = query_terms.union(graph_terms)

    scored_docs = []
    for doc in docs:
        doc_text = doc.page_content.lower()
        score = sum(1 for term in all_terms if term in doc_text)
        scored_docs.append((score, doc))

    top_docs = sorted(scored_docs, key=lambda x: x[0], reverse=True)[:5]
    return [doc.page_content for _, doc in top_docs]


# SEMANTIC CHUNKING
def split_llm_output_into_chapters(llm_text):
    """Split LLM-normalized text into chapters, starting a new chapter at
    every heading line matched by chapter_separator_regex."""
    chapters = []
    current_chapter = []
    for line in llm_text.splitlines():
        if re.match(chapter_separator_regex, line):
            if current_chapter:
                chapters.append("\n".join(current_chapter).strip())
            current_chapter = [line]
        else:
            current_chapter.append(line)
    if current_chapter:
        chapters.append("\n".join(current_chapter).strip())
    return chapters


def semantic_chunking(text, max_retries: int = 5):
    """
    Ask the LLM to normalize OCR text into headed sections (headings,
    columns, tables, explicit metrics). Returns the raw LLM message.

    Retries transient Gen AI failures up to max_retries times; the original
    bare ``except:`` retried forever and also swallowed KeyboardInterrupt.

    :raises RuntimeError: when all attempts fail (chains the last error).
    """
    prompt = f"""
    You received the following text extracted via OCR:
    {text}

    Your task:
    1. Identify headings (short uppercase or bold lines, no period at the end) putting the Product Name (Application Name) and the Subject
    2. Separate paragraphs by heading
    3. Indicate columns with [COLUMN 1], [COLUMN 2] if present
    4. Indicate tables with [TABLE] in markdown format
    5. Indicate explicity metrics (if it exists)
    Examples:
    - Oracle Financial Services RTO is 1 hour
    - The Oracle Banking Supply Chain Finance Cloud Service A maximum number of 10K Hosted Transactions
    - The Oracle Banking Payments Cloud Service, Additional Non-Production Environment: You may purchase up to a maximum of ten (10) additional Non-Production Environments
    """
    last_error = None
    for _ in range(max_retries):
        try:
            return llm_for_rag.invoke(prompt)
        except Exception as e:
            last_error = e
            print("[ERROR] Gen AI call error")
    raise RuntimeError("semantic_chunking: Gen AI call failed after retries") from last_error


def read_pdfs(pdf_path):
    """Load a PDF and return its concatenated page text. Files whose name
    contains "-ocr" already carry a text layer, so the faster PyMuPDF loader
    is used; everything else goes through UnstructuredPDFLoader."""
    if "-ocr" in pdf_path:
        doc_pages = PyMuPDFLoader(str(pdf_path)).load()
    else:
        doc_pages = UnstructuredPDFLoader(str(pdf_path)).load()
    return "\n".join(page.page_content for page in doc_pages)


def smart_split_text(text, max_chunk_size=10_000):
    """
    Split text into chunks of at most max_chunk_size characters, preferring
    to cut after sentence punctuation or a blank line; falls back to a hard
    cut when no such boundary exists in the window.
    """
    chunks = []
    start = 0
    text_length = len(text)
    while start < text_length:
        end = min(start + max_chunk_size, text_length)
        split_point = max(
            text.rfind('.', start, end),
            text.rfind('!', start, end),
            text.rfind('?', start, end),
            text.rfind('\n\n', start, end),
        )
        if split_point == -1 or split_point <= start:
            split_point = end          # no natural boundary: hard cut
        else:
            split_point += 1           # keep the punctuation in this chunk
        chunk = text[start:split_point].strip()
        if chunk:
            chunks.append(chunk)
        start = split_point
    return chunks


def load_previously_indexed_docs():
    """Return the set of already-indexed document paths (empty on first run)."""
    if os.path.exists(PROCESSED_DOCS_FILE):
        with open(PROCESSED_DOCS_FILE, "rb") as f:
            return pickle.load(f)
    return set()


def save_indexed_docs(docs):
    """Persist the indexed-document set next to the FAISS index."""
    # The index directory may not exist yet on a fresh run.
    os.makedirs(INDEX_PATH, exist_ok=True)
    with open(PROCESSED_DOCS_FILE, "wb") as f:
        pickle.dump(docs, f)


# Shared decision prompt (was duplicated verbatim inside chat() and at module
# level; hoisted to a single constant).
RFP_DECISION_TEMPLATE = """
You are answering an RFP requirement with risk awareness.

Requirement:
Type: {requirement_type}
Subject: {subject}
Expected value: {expected_value}

Document evidence:
{text_context}

Graph evidence:
{graph_context}

Decision rules:
- Answer ONLY with YES, NO or PARTIAL
- Do NOT assume anything not explicitly stated
- If value differs, answer PARTIAL
- If not found, answer NO

Confidence rules:
- HIGH: Explicit evidence directly answers the requirement
- MEDIUM: Evidence partially matches or requires light interpretation
- LOW: Requirement is ambiguous OR evidence is indirect OR missing

Ambiguity rules:
- ambiguity_detected = true if:
- The requirement can be interpreted in more than one way
- Keywords are vague (e.g. "support", "integration", "capability")
- Evidence does not clearly bind to subject + expected value

OUTPUT CONSTRAINTS (MANDATORY):
- Return ONLY a valid JSON object
- Do NOT include explanations, comments, markdown, lists, or code fences
- Do NOT write any text before or after the JSON
- The response must start with an opening curly brace and end with a closing curly brace

JSON schema (return exactly this structure):
{{
"answer": "YES | NO | PARTIAL",
"confidence": "HIGH | MEDIUM | LOW",
"ambiguity_detected": true,
"confidence_reason": "",
"justification": "",
"evidence": [
{{
"quote": "",
"source": ""
}}
]
}}
"""


# =========================
# Main Function
# =========================
def chat():
    """Interactive loop: (re)index PDFs into FAISS + the knowledge graph,
    then answer RFP questions until the user types 'quit'."""
    pdf_paths = ['RFP - Financial v2.pdf']
    already_indexed_docs = load_previously_indexed_docs()
    updated_docs = set()

    try:
        vectorstore = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
        print("βœ”οΈ FAISS index loaded.")
    except Exception:
        print("⚠️ FAISS index not found, creating a new one.")
        vectorstore = None

    new_chunks = []
    for pdf_path in tqdm(pdf_paths, desc="πŸ“„ Processing PDFs"):
        print(f" {os.path.basename(pdf_path)}")
        if pdf_path in already_indexed_docs:
            print(f"βœ… Document already indexed: {pdf_path}")
            continue

        full_text = read_pdfs(pdf_path=pdf_path)
        text_chunks = smart_split_text(full_text, max_chunk_size=10_000)

        # A chapter cut off at a chunk boundary is carried into the next
        # chunk so the LLM sees it whole.
        overflow_buffer = ""
        for chunk in tqdm(text_chunks, desc="πŸ“„ Processing text chunks", dynamic_ncols=True, leave=False):
            current_text = overflow_buffer + chunk
            treated_text = semantic_chunking(current_text)
            if hasattr(treated_text, "content"):
                chapters = split_llm_output_into_chapters(treated_text.content)
                last_chapter = chapters[-1] if chapters else ""
                if last_chapter and not last_chapter.strip().endswith((".", "!", "?", "\n\n")):
                    print("πŸ“Œ Last chapter seems incomplete, saving for the next cycle")
                    overflow_buffer = last_chapter
                    chapters = chapters[:-1]
                else:
                    overflow_buffer = ""
                for chapter_text in chapters:
                    doc = Document(page_content=chapter_text, metadata={"source": pdf_path})
                    new_chunks.append(doc)
                    print(f"βœ… New chapter indexed:\n{chapter_text}...\n")
            else:
                print(f"[ERROR] semantic_chunking returned unexpected type: {type(treated_text)}")
        updated_docs.add(str(pdf_path))

    if new_chunks:
        if vectorstore:
            vectorstore.add_documents(new_chunks)
        else:
            vectorstore = FAISS.from_documents(new_chunks, embedding=embeddings)
        vectorstore.save_local(INDEX_PATH)
        save_indexed_docs(already_indexed_docs.union(updated_docs))
        print(f"πŸ’Ύ {len(new_chunks)} chunks added to FAISS index.")
        print("🧠 Building knowledge graph...")
        create_knowledge_graph(new_chunks)
    else:
        print("πŸ“ No new documents to index.")

    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 50, "fetch_k": 100})

    def get_context_from_requirement(req: dict):
        # One retrieval + one graph query per question; the returned dict
        # carries every variable the decision prompt needs.
        query_terms = extract_graph_keywords_from_requirement(req)
        docs = retriever.invoke(query_terms)
        graph_context = query_knowledge_graph(query_terms)
        return {
            "text_context": "\n\n".join(doc.page_content for doc in docs),
            "graph_context": graph_context,
            "requirement_type": req["requirement_type"],
            "subject": req["subject"],
            "expected_value": req.get("expected_value", ""),
        }

    decision_prompt = PromptTemplate.from_template(RFP_DECISION_TEMPLATE)

    # Previously this chain ran get_context_from_requirement twice per
    # question (once per RunnableMap entry, each hitting FAISS and Oracle);
    # a single pass produces the identical prompt-variable dict.
    chain = (
        RunnableLambda(parse_rfp_requirement)
        | RunnableLambda(get_context_from_requirement)
        | decision_prompt
        | llm
        | StrOutputParser()
    )

    print("βœ… READY")
    while True:
        query = input("❓ Question (or 'quit' to exit): ")
        if query.lower() == "quit":
            break
        response = chain.invoke(query)
        print("\nπŸ“œ RESPONSE:\n")
        print(response)
        print("\n" + "=" * 80 + "\n")


# =========================
# Importable API (used when this module is imported rather than run)
# =========================
def get_context(x):
    """Retrieve, graph-enrich and rerank context for a raw query; returns the
    top chunks joined by blank lines."""
    query = x.get("input") if isinstance(x, dict) else x
    docs = retriever.invoke(query)
    req = parse_rfp_requirement(query)
    query_terms = extract_graph_keywords_from_requirement(req)
    graph_context = query_knowledge_graph(query_terms)
    graph_terms = extract_terms_from_graph_text(graph_context)
    reranked_chunks = rerank_documents_with_graph_terms(docs, query, graph_terms)
    return "\n\n".join(reranked_chunks)


def get_context_from_requirement(req: dict):
    """Module-level variant used by the import-time chain: retrieves by the
    requirement's keywords and reranks with graph-derived terms."""
    if retriever is None:
        # The FAISS index failed to load at import time.
        raise RuntimeError("FAISS index not available; run chat() to build it first")
    query_terms = extract_graph_keywords_from_requirement(req)
    docs = retriever.invoke(query_terms)
    graph_context = query_knowledge_graph(query_terms)
    graph_terms = extract_terms_from_graph_text(graph_context)
    reranked_chunks = rerank_documents_with_graph_terms(docs, query_terms, graph_terms)
    return {
        "text_context": "\n\n".join(reranked_chunks),
        "graph_context": graph_context,
        "requirement_type": req["requirement_type"],
        "subject": req["subject"],
        "expected_value": req.get("expected_value", ""),
    }


try:
    vectorstore = FAISS.load_local(
        INDEX_PATH, embeddings, allow_dangerous_deserialization=True
    )
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 50, "fetch_k": 100},
    )
except Exception:
    # Keep the names defined so get_context_from_requirement can fail with a
    # clear message instead of a NameError.
    print("No Faiss")
    vectorstore = None
    retriever = None

prompt = PromptTemplate.from_template(RFP_DECISION_TEMPLATE)

parse_requirement_runnable = RunnableLambda(parse_rfp_requirement)

# Single-pass chain (same dedup as in chat()): parse -> context dict ->
# prompt -> decision LLM -> string.
chain = (
    parse_requirement_runnable
    | RunnableLambda(get_context_from_requirement)
    | prompt
    | llm
    | StrOutputParser()
)


def answer_question(question: str) -> str:
    """Answer a single RFP question through the module-level chain."""
    return chain.invoke(question)


# πŸš€ Run
if __name__ == "__main__":
    chat()