Mirror of https://github.com/hoshikawa2/oci_graph_23ai.git
Commit: First Commit
files/main.py (new file, 503 lines)
@@ -0,0 +1,503 @@
from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
from langchain_core.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_community.embeddings import OCIGenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnableMap
from langchain_community.document_loaders import UnstructuredPDFLoader, PyMuPDFLoader
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda

from tqdm import tqdm
import os
import pickle
import re
import atexit
import oracledb

# =========================
# Oracle Autonomous Configuration
# =========================
WALLET_PATH = "Wallet_oradb23ai"  # Folder with your downloaded and unzipped Autonomous Database wallet
DB_ALIAS = "oradb23ai_high"       # Your database name plus the _high TNS alias
USERNAME = "USERNAME"             # Your database username
PASSWORD = "PASSWORD"             # Your database password (also passed as the wallet password below)
os.environ["TNS_ADMIN"] = WALLET_PATH
GRAPH_NAME = "my_graph"

oracle_conn = oracledb.connect(
    user=USERNAME,
    password=PASSWORD,
    dsn=DB_ALIAS,
    config_dir=WALLET_PATH,
    wallet_location=WALLET_PATH,
    wallet_password=PASSWORD,
)
atexit.register(oracle_conn.close)

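# Optional smoke test (a sketch, not part of the original flow): verify the
# wallet connection resolves before any DDL runs.
# with oracle_conn.cursor() as check:
#     check.execute("SELECT 1 FROM dual")
#     print("DB connection OK:", check.fetchone()[0])
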
# =========================
# Graph Backing Tables (ENTITIES / RELATIONS)
# =========================
def create_tables_if_not_exist(conn):
    cursor = conn.cursor()
    try:
        cursor.execute("""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE TABLE ENTITIES (
                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                        NAME VARCHAR2(500)
                    )
                ';
            EXCEPTION
                WHEN OTHERS THEN
                    IF SQLCODE != -955 THEN  -- ORA-00955: name is already used by an existing object
                        RAISE;
                    END IF;
            END;
        """)
        cursor.execute("""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE TABLE RELATIONS (
                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                        SOURCE_ID NUMBER,
                        TARGET_ID NUMBER,
                        RELATION_TYPE VARCHAR2(100),
                        SOURCE_TEXT VARCHAR2(4000)
                    )
                ';
            EXCEPTION
                WHEN OTHERS THEN
                    IF SQLCODE != -955 THEN
                        RAISE;
                    END IF;
            END;
        """)
        conn.commit()
        print("✅ ENTITIES and RELATIONS tables created or already exist.")
    except Exception as e:
        print(f"[ERROR] Failed to create tables: {e}")
    finally:
        cursor.close()


create_tables_if_not_exist(oracle_conn)

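# Optional check (sketch): list the backing tables to confirm the DDL above
# took effect in this schema.
# with oracle_conn.cursor() as cur:
#     cur.execute("SELECT table_name FROM user_tables WHERE table_name IN ('ENTITIES', 'RELATIONS')")
#     print([row[0] for row in cur.fetchall()])
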
# =========================
# Global Configurations
# =========================
INDEX_PATH = "./faiss_index"
PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl")
chapter_separator_regex = r"^(#{1,6} .+|\*\*.+\*\*)$"

# =========================
# LLM Definitions
# =========================
llm = ChatOCIGenAI(
    model_id="meta.llama-3.1-405b-instruct",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",  # Your compartment OCID
    auth_profile="DEFAULT",
    model_kwargs={"temperature": 0.7, "top_p": 0.75, "max_tokens": 4000},
)

llm_for_rag = ChatOCIGenAI(
    model_id="meta.llama-3.1-405b-instruct",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",  # Your compartment OCID
    auth_profile="DEFAULT",
)

embeddings = OCIGenAIEmbeddings(
    model_id="cohere.embed-multilingual-v3.0",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",  # Your compartment OCID
    auth_profile="DEFAULT",
)

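# Optional sanity check (sketch, assumes the DEFAULT OCI profile is valid):
# embedding one short string confirms the Gen AI endpoint is reachable before
# indexing whole PDFs; cohere.embed-multilingual-v3.0 returns 1024-dim vectors.
# print(len(embeddings.embed_query("connectivity test")))
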
def create_knowledge_graph(chunks):
    cursor = oracle_conn.cursor()

    # Create the property graph if it does not exist
    try:
        cursor.execute(f"""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE PROPERTY GRAPH {GRAPH_NAME}
                        VERTEX TABLES (ENTITIES
                            KEY (ID)
                            LABEL ENTITIES
                            PROPERTIES (NAME))
                        EDGE TABLES (RELATIONS
                            KEY (ID)
                            SOURCE KEY (SOURCE_ID) REFERENCES ENTITIES(ID)
                            DESTINATION KEY (TARGET_ID) REFERENCES ENTITIES(ID)
                            LABEL RELATIONS
                            PROPERTIES (RELATION_TYPE, SOURCE_TEXT))
                ';
            EXCEPTION
                WHEN OTHERS THEN
                    IF SQLCODE != -55358 THEN  -- ORA-55358: graph already exists
                        RAISE;
                    END IF;
            END;
        """)
        print(f"🧠 Graph '{GRAPH_NAME}' created or already exists.")
    except Exception as e:
        print(f"[GRAPH ERROR] Failed to create graph: {e}")

    # Insert vertices and edges into the backing tables
    for doc in chunks:
        text = doc.page_content
        source = doc.metadata.get("source", "unknown")

        if not text.strip():
            continue

        prompt = f"""
        You are an expert in knowledge extraction.

        Given the following technical text:

        {text}

        Extract key entities and relationships in the format:
        - Entity1 -[RELATION]-> Entity2

        Use UPPERCASE for RELATION types.
        Return 'NONE' if nothing is found.
        """
        try:
            response = llm_for_rag.invoke(prompt)
            result = response.content.strip()
        except Exception as e:
            print(f"[ERROR] Gen AI call error: {e}")
            continue

        if result.upper() == "NONE":
            continue

        for triple in result.splitlines():
            parts = triple.split("-[")
            if len(parts) != 2:
                continue

            right_part = parts[1].split("]->")
            if len(right_part) != 2:
                continue

            raw_relation, entity2 = right_part
            relation = re.sub(r'\W+', '_', raw_relation.strip().upper())
            entity1 = parts[0].strip().lstrip("- ").strip()  # drop the list bullet the prompt asks the model to emit
            entity2 = entity2.strip()

            try:
                # Insert entities idempotently (MERGE avoids duplicate names)
                for name in (entity1, entity2):
                    cursor.execute("""
                        MERGE INTO ENTITIES e
                        USING (SELECT :name AS NAME FROM dual) src
                        ON (e.name = src.name)
                        WHEN NOT MATCHED THEN INSERT (NAME) VALUES (src.name)
                    """, {"name": name})
                # Retrieve the generated IDs
                cursor.execute("SELECT ID FROM ENTITIES WHERE NAME = :name", {"name": entity1})
                source_id = cursor.fetchone()[0]
                cursor.execute("SELECT ID FROM ENTITIES WHERE NAME = :name", {"name": entity2})
                target_id = cursor.fetchone()[0]
                # Create the relation edge
                cursor.execute("""
                    INSERT INTO RELATIONS (SOURCE_ID, TARGET_ID, RELATION_TYPE, SOURCE_TEXT)
                    VALUES (:src, :tgt, :rel, :txt)
                """, {"src": source_id, "tgt": target_id, "rel": relation, "txt": source})
                print(f"✅ {entity1} -[{relation}]-> {entity2}")
            except Exception as e:
                print(f"[INSERT ERROR] {e}")

    oracle_conn.commit()
    cursor.close()
    print("💾 Knowledge graph updated.")

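# Minimal usage sketch (hypothetical inline document, not from the original
# file): running one sentence through the extractor should insert a couple of
# vertices and one edge.
# create_knowledge_graph([Document(
#     page_content="API Gateway routes requests to Oracle Functions.",
#     metadata={"source": "inline-example"},
# )])
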
def extract_graph_keywords(question: str) -> str:
    prompt = f"""
    Based on the question below, extract relevant keywords (1 to 2 words per term) that can be used to search for entities and relationships in a technical knowledge graph.

    Question: "{question}"

    Rules:
    - Split compound terms (e.g., "API Gateway" → "API", "Gateway")
    - Remove duplicates
    - Do not include generic words such as: "what", "how", "the", "of", "in the document", etc.
    - Return only the keywords, separated by commas. No explanations.

    Result:
    """
    try:
        resp = llm_for_rag.invoke(prompt)
        keywords_raw = resp.content.strip()

        # Post-processing: deduplicate and normalize to lowercase
        keywords = {kw.strip().lower() for kw in re.split(r'[,\n]+', keywords_raw)}
        keywords = [kw for kw in keywords if kw]  # drop empty strings
        return ", ".join(sorted(keywords))
    except Exception as e:
        print(f"[KEYWORD EXTRACTION ERROR] {e}")
        return ""

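# Illustration (actual output depends on the LLM): for a question like
# "How does the API Gateway call Functions?" this typically returns
# something like "api, functions, gateway".
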
def query_knowledge_graph(query_text):
    cursor = oracle_conn.cursor()

    sanitized_text = query_text.lower()

    # Note: CONTAINS() requires Oracle Text indexes on ENTITIES.NAME and
    # RELATIONS.RELATION_TYPE. The keyword list is interpolated into the
    # statement, so it should only come from extract_graph_keywords(),
    # never from raw user input.
    pgql = f"""
        SELECT from_entity,
               relation_type,
               to_entity
          FROM GRAPH_TABLE(
                 {GRAPH_NAME}
                 MATCH (e1 IS ENTITIES)-[r IS RELATIONS]->(e2 IS ENTITIES)
                 WHERE CONTAINS(e1.name, '{sanitized_text}') > 0
                    OR CONTAINS(e2.name, '{sanitized_text}') > 0
                    OR CONTAINS(r.RELATION_TYPE, '{sanitized_text}') > 0
                 COLUMNS (
                     e1.name AS from_entity,
                     r.RELATION_TYPE AS relation_type,
                     e2.name AS to_entity
                 )
               )
         FETCH FIRST 20 ROWS ONLY
    """

    print(pgql)

    try:
        cursor.execute(pgql)
        rows = cursor.fetchall()
        if not rows:
            return "⚠️ No relationships found in the graph."
        return "\n".join(f"{r[0]} -[{r[1]}]-> {r[2]}" for r in rows)
    except Exception as e:
        return f"[PGQL ERROR] {e}"
    finally:
        cursor.close()

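# Note: in Oracle Text, a comma inside a CONTAINS query is the ACCUM operator,
# so the comma-separated list from extract_graph_keywords() matches rows
# containing any of the keywords. A typical result line looks like
# "API Gateway -[ROUTES_TO]-> Oracle Functions" (illustrative).
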
def split_llm_output_into_chapters(llm_text):
    chapters = []
    current_chapter = []
    lines = llm_text.splitlines()

    for line in lines:
        if re.match(chapter_separator_regex, line):
            if current_chapter:
                chapters.append("\n".join(current_chapter).strip())
            current_chapter = [line]
        else:
            current_chapter.append(line)

    if current_chapter:
        chapters.append("\n".join(current_chapter).strip())

    return chapters

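# Quick illustration (hypothetical input): each line matching
# chapter_separator_regex starts a new chapter, so
# split_llm_output_into_chapters("# Intro\ntext\n## Setup\nmore text")
# returns ["# Intro\ntext", "## Setup\nmore text"].
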
def semantic_chunking(text):
    prompt = f"""
    You received the following text extracted via OCR:

    {text}

    Your task:
    1. Identify headings (short uppercase or bold lines, no period at the end)
    2. Separate paragraphs by heading
    3. Indicate columns with [COLUMN 1], [COLUMN 2] if present
    4. Indicate tables with [TABLE] in markdown format
    """

    # Retry transient Gen AI errors, but give up after a few attempts instead
    # of looping forever on a permanent failure.
    for attempt in range(3):
        try:
            return llm_for_rag.invoke(prompt)
        except Exception as e:
            print(f"[ERROR] Gen AI call error (attempt {attempt + 1}/3): {e}")
    return None

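# Note: semantic_chunking returns the raw chat-model response (an AIMessage
# whose .content holds the restructured text), or None after repeated
# failures — which is why the caller below checks hasattr(..., "content").
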
def read_pdfs(pdf_path):
    # Files named with "-ocr" are already OCR'd, so the lighter PyMuPDF loader
    # suffices; everything else goes through UnstructuredPDFLoader.
    if "-ocr" in pdf_path:
        doc_pages = PyMuPDFLoader(str(pdf_path)).load()
    else:
        doc_pages = UnstructuredPDFLoader(str(pdf_path)).load()
    full_text = "\n".join(page.page_content for page in doc_pages)
    return full_text

def smart_split_text(text, max_chunk_size=10_000):
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = min(start + max_chunk_size, text_length)
        # Prefer to cut at a sentence or paragraph boundary inside the window
        split_point = max(
            text.rfind('.', start, end),
            text.rfind('!', start, end),
            text.rfind('?', start, end),
            text.rfind('\n\n', start, end)
        )
        if split_point == -1 or split_point <= start:
            split_point = end
        else:
            split_point += 1

        chunk = text[start:split_point].strip()
        if chunk:
            chunks.append(chunk)

        start = split_point

    return chunks

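# Boundary behavior (hypothetical input): the split prefers sentence ends, so
# smart_split_text("First sentence. Second sentence.", max_chunk_size=20)
# returns ["First sentence.", "Second sentence."].
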
def load_previously_indexed_docs():
    if os.path.exists(PROCESSED_DOCS_FILE):
        with open(PROCESSED_DOCS_FILE, "rb") as f:
            return pickle.load(f)
    return set()


def save_indexed_docs(docs):
    os.makedirs(INDEX_PATH, exist_ok=True)  # safety: the index folder may not exist yet
    with open(PROCESSED_DOCS_FILE, "wb") as f:
        pickle.dump(docs, f)

# =========================
# Main Function
# =========================
def chat():
    # pdf_paths = [
    #     './Manuals/SOASUITE.pdf',
    #     './Manuals/using-integrations-oracle-integration-3.pdf'
    # ]

    pdf_paths = ['AAAAAAAAAA.pdf']  # Your PDF files as a knowledge base

    already_indexed_docs = load_previously_indexed_docs()
    updated_docs = set()

    try:
        vectorstore = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
        print("✔️ FAISS index loaded.")
    except Exception:
        print("⚠️ FAISS index not found, creating a new one.")
        vectorstore = None

    new_chunks = []

    for pdf_path in tqdm(pdf_paths, desc="📄 Processing PDFs"):
        print(f" {os.path.basename(pdf_path)}")
        if pdf_path in already_indexed_docs:
            print(f"✅ Document already indexed: {pdf_path}")
            continue
        full_text = read_pdfs(pdf_path=pdf_path)

        text_chunks = smart_split_text(full_text, max_chunk_size=10_000)
        overflow_buffer = ""

        for chunk in tqdm(text_chunks, desc="📄 Processing text chunks", dynamic_ncols=True, leave=False):
            current_text = overflow_buffer + chunk

            treated_text = semantic_chunking(current_text)

            if hasattr(treated_text, "content"):
                chapters = split_llm_output_into_chapters(treated_text.content)

                last_chapter = chapters[-1] if chapters else ""

                # If the last chapter was cut mid-sentence, carry it over to
                # the next chunk instead of indexing a fragment.
                if last_chapter and not last_chapter.strip().endswith((".", "!", "?", "\n\n")):
                    print("📌 Last chapter seems incomplete, saving for the next cycle")
                    overflow_buffer = last_chapter
                    chapters = chapters[:-1]
                else:
                    overflow_buffer = ""

                for chapter_text in chapters:
                    doc = Document(page_content=chapter_text, metadata={"source": pdf_path})
                    new_chunks.append(doc)
                    print(f"✅ New chapter indexed:\n{chapter_text}...\n")

            else:
                print(f"[ERROR] semantic_chunking returned unexpected type: {type(treated_text)}")

        updated_docs.add(str(pdf_path))

    if new_chunks:
        if vectorstore is not None:
            vectorstore.add_documents(new_chunks)
        else:
            vectorstore = FAISS.from_documents(new_chunks, embedding=embeddings)

        vectorstore.save_local(INDEX_PATH)
        save_indexed_docs(already_indexed_docs.union(updated_docs))
        print(f"💾 {len(new_chunks)} chunks added to FAISS index.")

        print("🧠 Building knowledge graph...")
        create_knowledge_graph(new_chunks)

    else:
        print("📁 No new documents to index.")

    # Guard: without any indexed documents there is nothing to retrieve
    if vectorstore is None:
        print("[ERROR] No FAISS index available — add at least one PDF before chatting.")
        return

    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 50, "fetch_k": 100})
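    # Note: with search_type="similarity", FAISS uses fetch_k only when a
    # metadata filter is supplied; without one, the k=50 nearest chunks are
    # returned directly.
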
    template = """
    Document context:
    {context}

    Graph context:
    {graph_context}

    Question:
    {input}

    Interpretation rules:
    - You can search for a step-by-step tutorial about a subject
    - You can search for a concept description about a subject
    - You can search for a list of components about a subject
    """
    prompt = PromptTemplate.from_template(template)

    def get_context(x):
        query = x.get("input") if isinstance(x, dict) else x
        return retriever.invoke(query)

    chain = (
        RunnableMap({
            "context": RunnableLambda(get_context),
            "graph_context": RunnableLambda(
                lambda x: query_knowledge_graph(
                    extract_graph_keywords(x.get("input") if isinstance(x, dict) else x)
                )
            ),
            "input": lambda x: x.get("input") if isinstance(x, dict) else x,
        })
        | prompt
        | llm
        | StrOutputParser()
    )

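    # The RunnableMap fans the same question out to both retrieval paths:
    # FAISS similarity search fills {context}, while keyword extraction plus
    # the PGQL query fills {graph_context}; the prompt merges both for the LLM.
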
print("✅ READY")
|
||||
|
||||
while True:
|
||||
query = input("❓ Question (or 'quit' to exit): ")
|
||||
if query.lower() == "quit":
|
||||
break
|
||||
response = chain.invoke(query)
|
||||
print("\n📜 RESPONSE:\n")
|
||||
print(response)
|
||||
print("\n" + "=" * 80 + "\n")
|
||||
|
||||
# if __name__ == "__main__":
|
||||
# print("Iniciando")
|
||||
# print(query_knowledge_graph("gateway"))
|
||||
|
||||
# 🚀 Run
|
||||
if __name__ == "__main__":
|
||||
chat()
|
||||
Reference in New Issue
Block a user