oci_graph_23ai/files/main.py

from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
from langchain_core.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_community.embeddings import OCIGenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnableMap
from langchain_community.document_loaders import UnstructuredPDFLoader, PyMuPDFLoader
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda

from tqdm import tqdm
import os
import pickle
import re
import atexit
import oracledb

# =========================
# Oracle Autonomous Configuration
# =========================
WALLET_PATH = "Wallet_oradb23ai" # Your Wallet for Autonomous Database downloaded and unziped folder
DB_ALIAS = "oradb23ai_high" # Your database name plus _high as your TNS Definitions
USERNAME = "USERNAME" # Your Wallet username
PASSWORD = "PASSWORD" # Your Wallet password
os.environ["TNS_ADMIN"] = WALLET_PATH
GRAPH_NAME = "my_graph"

oracle_conn = oracledb.connect(
    user=USERNAME,
    password=PASSWORD,
    dsn=DB_ALIAS,
    config_dir=WALLET_PATH,
    wallet_location=WALLET_PATH,
    wallet_password=PASSWORD
)
atexit.register(lambda: oracle_conn.close())

# =========================
# Oracle Graph Client
# =========================
def create_tables_if_not_exist(conn):
    cursor = conn.cursor()

    try:
        cursor.execute("""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE TABLE ENTITIES (
                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                        NAME VARCHAR2(500)
                    )
                ';
            EXCEPTION
                WHEN OTHERS THEN
                    IF SQLCODE != -955 THEN
                        RAISE;
                    END IF;
            END;
        """)
        cursor.execute("""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE TABLE RELATIONS (
                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                        SOURCE_ID NUMBER,
                        TARGET_ID NUMBER,
                        RELATION_TYPE VARCHAR2(100),
                        SOURCE_TEXT VARCHAR2(4000)
                    )
                ';
            EXCEPTION
                WHEN OTHERS THEN
                    IF SQLCODE != -955 THEN
                        RAISE;
                    END IF;
            END;
        """)
        conn.commit()
        print("✅ ENTITIES and RELATIONS tables created or already exist.")
    except Exception as e:
        print(f"[ERROR] Failed to create tables: {e}")
    finally:
        cursor.close()


create_tables_if_not_exist(oracle_conn)

# =========================
# Global Configurations
# =========================
INDEX_PATH = "./faiss_index"
PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl")
chapter_separator_regex = r"^(#{1,6} .+|\*\*.+\*\*)$"

# =========================
# LLM Definitions
# =========================
llm = ChatOCIGenAI(
    model_id="meta.llama-3.1-405b-instruct",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
    auth_profile="DEFAULT",
    model_kwargs={"temperature": 0.7, "top_p": 0.75, "max_tokens": 4000},
)

llm_for_rag = ChatOCIGenAI(
    model_id="meta.llama-3.1-405b-instruct",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
    auth_profile="DEFAULT",
)

embeddings = OCIGenAIEmbeddings(
    model_id="cohere.embed-multilingual-v3.0",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
    auth_profile="DEFAULT",
)

def create_knowledge_graph(chunks):
    cursor = oracle_conn.cursor()

    # Creates graph if it does not exist
    try:
        cursor.execute(f"""
        BEGIN
            EXECUTE IMMEDIATE '
                CREATE PROPERTY GRAPH {GRAPH_NAME}
                  VERTEX TABLES (ENTITIES
                    KEY (ID)
                    LABEL ENTITIES
                    PROPERTIES (NAME))
                  EDGE TABLES (RELATIONS
                    KEY (ID)
                    SOURCE KEY (SOURCE_ID) REFERENCES ENTITIES(ID)
                    DESTINATION KEY (TARGET_ID) REFERENCES ENTITIES(ID)
                    LABEL RELATIONS
                    PROPERTIES (RELATION_TYPE, SOURCE_TEXT))
            ';
        EXCEPTION
            WHEN OTHERS THEN
                IF SQLCODE != -55358 THEN -- ORA-55358: Graph already exists
                    RAISE;
                END IF;
        END;
        """)
        print(f"🧠 Graph '{GRAPH_NAME}' created or already exists.")
    except Exception as e:
        print(f"[GRAPH ERROR] Failed to create graph: {e}")

    # Inserting vertices and edges into the tables
    for doc in chunks:
        text = doc.page_content
        source = doc.metadata.get("source", "unknown")

        if not text.strip():
            continue

        prompt = f"""
        You are an expert in knowledge extraction.

        Given the following technical text:

        {text}

        Extract key entities and relationships in the format:
        - Entity1 -[RELATION]-> Entity2

        Use UPPERCASE for RELATION types.
        Return 'NONE' if nothing found.
        """
        try:
            response = llm_for_rag.invoke(prompt)
            result = response.content.strip()
        except Exception as e:
            print(f"[ERROR] Gen AI call error: {e}")
            continue

        if result.upper() == "NONE":
            continue

        triples = result.splitlines()
        for triple in triples:
            parts = triple.split("-[")
            if len(parts) != 2:
                continue

            right_part = parts[1].split("]->")
            if len(right_part) != 2:
                continue

            raw_relation, entity2 = right_part
            relation = re.sub(r'\W+', '_', raw_relation.strip().upper())
            entity1 = parts[0].strip()
            entity2 = entity2.strip()

            try:
                # Insertion of entities (with existence check)
                cursor.execute("MERGE INTO ENTITIES e USING (SELECT :name AS NAME FROM dual) src ON (e.name = src.name) WHEN NOT MATCHED THEN INSERT (NAME) VALUES (:name)", [entity1, entity1])
                cursor.execute("MERGE INTO ENTITIES e USING (SELECT :name AS NAME FROM dual) src ON (e.name = src.name) WHEN NOT MATCHED THEN INSERT (NAME) VALUES (:name)", [entity2, entity2])
                # Retrieve the IDs
                cursor.execute("SELECT ID FROM ENTITIES WHERE NAME = :name", [entity1])
                source_id = cursor.fetchone()[0]
                cursor.execute("SELECT ID FROM ENTITIES WHERE NAME = :name", [entity2])
                target_id = cursor.fetchone()[0]
                # Create relations
                cursor.execute("""
                    INSERT INTO RELATIONS (SOURCE_ID, TARGET_ID, RELATION_TYPE, SOURCE_TEXT)
                    VALUES (:src, :tgt, :rel, :txt)
                """, [source_id, target_id, relation, source])
                print(f"✅ {entity1} -[{relation}]-> {entity2}")
            except Exception as e:
                print(f"[INSERT ERROR] {e}")

    oracle_conn.commit()
    cursor.close()
    print("💾 Knowledge graph updated.")

def extract_graph_keywords(question: str) -> str:
    prompt = f"""
    Based on the question below, extract relevant keywords (1 to 2 words per term) that can be used to search for entities and relationships in a technical knowledge graph.

    Question: "{question}"

    Rules:
    - Split compound terms (e.g., "API Gateway" → "API", "Gateway")
    - Remove duplicates
    - Do not include generic words such as: "what", "how", "the", "of", "in the document", etc.
    - Return only the keywords, separated by commas. No explanations.

    Result:
    """
    try:
        resp = llm_for_rag.invoke(prompt)
        keywords_raw = resp.content.strip()

        # Additional post-processing: remove duplicates, normalize
        keywords = {kw.strip().lower() for kw in re.split(r'[,\n]+', keywords_raw)}
        keywords = [kw for kw in keywords if kw]  # remove empty strings
        return ", ".join(sorted(keywords))
    except Exception as e:
        print(f"[KEYWORD EXTRACTION ERROR] {e}")
        return ""

def query_knowledge_graph(query_text):
    cursor = oracle_conn.cursor()

    sanitized_text = query_text.lower()

    pgql = f"""
        SELECT from_entity,
               relation_type,
               to_entity
        FROM GRAPH_TABLE(
            {GRAPH_NAME}
            MATCH (e1 is ENTITIES)-[r is RELATIONS]->(e2 is ENTITIES)
            WHERE CONTAINS(e1.name, '{sanitized_text}') > 0
               OR CONTAINS(e2.name, '{sanitized_text}') > 0
               OR CONTAINS(r.RELATION_TYPE, '{sanitized_text}') > 0
            COLUMNS (
                e1.name AS from_entity,
                r.RELATION_TYPE AS relation_type,
                e2.name AS to_entity
            )
        )
        FETCH FIRST 20 ROWS ONLY
    """

    # Show the query formulated for Graph
    print(pgql)

    try:
        cursor.execute(pgql)
        rows = cursor.fetchall()
        if not rows:
            return "⚠️ No relationships found in the graph."

        return "\n".join(f"{r[0]} -[{r[1]}]-> {r[2]}" for r in rows)

    except Exception as e:
        return f"[PGQL ERROR] {e}"

    finally:
        cursor.close()

def split_llm_output_into_chapters(llm_text):
    chapters = []
    current_chapter = []
    lines = llm_text.splitlines()

    for line in lines:
        if re.match(chapter_separator_regex, line):
            if current_chapter:
                chapters.append("\n".join(current_chapter).strip())
            current_chapter = [line]
        else:
            current_chapter.append(line)

    if current_chapter:
        chapters.append("\n".join(current_chapter).strip())

    return chapters


def semantic_chunking(text):
    prompt = f"""
    You received the following text extracted via OCR:

    {text}

    Your task:
    1. Identify headings (short uppercase or bold lines, no period at the end)
    2. Separate paragraphs by heading
    3. Indicate columns with [COLUMN 1], [COLUMN 2] if present
    4. Indicate tables with [TABLE] in markdown format
    """

    get_out = False
    while not get_out:
        try:
            response = llm_for_rag.invoke(prompt)
            get_out = True
        except:
            print("[ERROR] Gen AI call error")

    return response


def read_pdfs(pdf_path):
    if "-ocr" in pdf_path:
        doc_pages = PyMuPDFLoader(str(pdf_path)).load()
    else:
        doc_pages = UnstructuredPDFLoader(str(pdf_path)).load()
    full_text = "\n".join([page.page_content for page in doc_pages])
    return full_text


def smart_split_text(text, max_chunk_size=10_000):
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = min(start + max_chunk_size, text_length)
        split_point = max(
            text.rfind('.', start, end),
            text.rfind('!', start, end),
            text.rfind('?', start, end),
            text.rfind('\n\n', start, end)
        )
        if split_point == -1 or split_point <= start:
            split_point = end
        else:
            split_point += 1

        chunk = text[start:split_point].strip()
        if chunk:
            chunks.append(chunk)

        start = split_point

    return chunks


def load_previously_indexed_docs():
    if os.path.exists(PROCESSED_DOCS_FILE):
        with open(PROCESSED_DOCS_FILE, "rb") as f:
            return pickle.load(f)
    return set()


def save_indexed_docs(docs):
    with open(PROCESSED_DOCS_FILE, "wb") as f:
        pickle.dump(docs, f)


# =========================
# Main Function
# =========================
def chat():
    pdf_paths = ['AAAAAAAAAA.pdf'] # Your PDF Files as a Knowledge Base

    already_indexed_docs = load_previously_indexed_docs()
    updated_docs = set()

    try:
        vectorstore = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
        print("✔️ FAISS index loaded.")
    except Exception:
        print("⚠️ FAISS index not found, creating a new one.")
        vectorstore = None

    new_chunks = []

    for pdf_path in tqdm(pdf_paths, desc=f"📄 Processing PDFs"):
        print(f" {os.path.basename(pdf_path)}")
        if pdf_path in already_indexed_docs:
            print(f"✅ Document already indexed: {pdf_path}")
            continue
        full_text = read_pdfs(pdf_path=pdf_path)

        text_chunks = smart_split_text(full_text, max_chunk_size=10_000)
        overflow_buffer = ""

        for chunk in tqdm(text_chunks, desc=f"📄 Processing text chunks", dynamic_ncols=True, leave=False):
            current_text = overflow_buffer + chunk

            treated_text = semantic_chunking(current_text)

            if hasattr(treated_text, "content"):
                chapters = split_llm_output_into_chapters(treated_text.content)

                last_chapter = chapters[-1] if chapters else ""

                if last_chapter and not last_chapter.strip().endswith((".", "!", "?", "\n\n")):
                    print("📌 Last chapter seems incomplete, saving for the next cycle")
                    overflow_buffer = last_chapter
                    chapters = chapters[:-1]
                else:
                    overflow_buffer = ""

                for chapter_text in chapters:
                    doc = Document(page_content=chapter_text, metadata={"source": pdf_path})
                    new_chunks.append(doc)
                    print(f"✅ New chapter indexed:\n{chapter_text}...\n")

            else:
                print(f"[ERROR] semantic_chunking returned unexpected type: {type(treated_text)}")

        updated_docs.add(str(pdf_path))

    if new_chunks:
        if vectorstore:
            vectorstore.add_documents(new_chunks)
        else:
            vectorstore = FAISS.from_documents(new_chunks, embedding=embeddings)

        vectorstore.save_local(INDEX_PATH)
        save_indexed_docs(already_indexed_docs.union(updated_docs))
        print(f"💾 {len(new_chunks)} chunks added to FAISS index.")

        print("🧠 Building knowledge graph...")
        create_knowledge_graph(new_chunks)

    else:
        print("📁 No new documents to index.")

    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 50, "fetch_k": 100})

    template = """
        Document context:
        {context}

        Graph context:
        {graph_context}

        Question:
        {input}

        Interpretation rules:
        - You can search for a step-by-step tutorial about a subject
        - You can search a concept description about a subject
        - You can search for a list of components about a subject
    """
    prompt = PromptTemplate.from_template(template)

    def get_context(x):
        query = x.get("input") if isinstance(x, dict) else x
        return retriever.invoke(query)

    chain = (
            RunnableMap({
                "context": RunnableLambda(get_context),
                "graph_context": RunnableLambda(lambda x: query_knowledge_graph(extract_graph_keywords(x.get("input") if isinstance(x, dict) else x))),
                "input": lambda x: x.get("input") if isinstance(x, dict) else x
            })
            | prompt
            | llm
            | StrOutputParser()
    )

    print("✅ READY")

    while True:
        query = input("❓ Question (or 'quit' to exit): ")
        if query.lower() == "quit":
            break
        response = chain.invoke(query)
        print("\n📜 RESPONSE:\n")
        print(response)
        print("\n" + "=" * 80 + "\n")

# 🚀 Run
if __name__ == "__main__":
    chat()