commit 842e89eb8fedbf0bf605e4d9c80914daa9e1d43f
Author: Cristiano Hoshikawa
Date:   Thu Jul 10 09:26:27 2025 -0300

    First Commit

diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..7d9a8e5
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,12 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Environment-dependent path to Maven home directory
+/mavenHomeManager.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Zeppelin ignored files
+/ZeppelinRemoteNotebooks/
diff --git a/.idea/codeStyles/Project.xml b/.idea/codeStyles/Project.xml
new file mode 100644
index 0000000..919ce1f
--- /dev/null
+++ b/.idea/codeStyles/Project.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml
new file mode 100644
index 0000000..a55e7a1
--- /dev/null
+++ b/.idea/codeStyles/codeStyleConfig.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..e5af510
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..e4adfc2
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/oci_graph_23ai.iml b/.idea/oci_graph_23ai.iml
new file mode 100644
index 0000000..d6ebd48
--- /dev/null
+++ b/.idea/oci_graph_23ai.iml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/files/main.py b/files/main.py
new file mode 100644
index 0000000..f8cb3c6
--- /dev/null
+++ b/files/main.py
@@ -0,0 +1,503 @@
+from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
+from langchain_core.prompts import PromptTemplate
+from langchain.schema.output_parser import StrOutputParser
+from langchain_community.embeddings import OCIGenAIEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain.schema.runnable import RunnableMap
+from langchain_community.document_loaders import UnstructuredPDFLoader, PyMuPDFLoader
+from langchain_core.documents import Document
+from langchain_core.runnables import RunnableLambda
+
+from tqdm import tqdm
+import os
+import pickle
+import re
+import atexit
+import oracledb
+
+# =========================
+# Oracle Autonomous Configuration
+# =========================
+WALLET_PATH = "Wallet_oradb23ai"  # Folder with your downloaded and unzipped Autonomous Database wallet
+DB_ALIAS = "oradb23ai_high"  # Your database service name with the _high suffix, as defined in tnsnames.ora
+USERNAME = "USERNAME"  # Your database username
+PASSWORD = "PASSWORD"  # Your database password
+os.environ["TNS_ADMIN"] = WALLET_PATH
+GRAPH_NAME = "my_graph"
+
+oracle_conn = oracledb.connect(
+    user=USERNAME,
+    password=PASSWORD,
+    dsn=DB_ALIAS,
+    config_dir=WALLET_PATH,
+    wallet_location=WALLET_PATH,
+    wallet_password=PASSWORD
+)
+atexit.register(lambda: oracle_conn.close())
+
+# =========================
+# Oracle Graph Client
+# =========================
+def create_tables_if_not_exist(conn):
+    cursor = conn.cursor()
+
+    try:
+        cursor.execute("""
+            BEGIN
+                EXECUTE IMMEDIATE '
+                    CREATE TABLE ENTITIES (
+                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
NAME VARCHAR2(500) + ) + '; + EXCEPTION + WHEN OTHERS THEN + IF SQLCODE != -955 THEN + RAISE; + END IF; + END; + """) + cursor.execute(""" + BEGIN + EXECUTE IMMEDIATE ' + CREATE TABLE RELATIONS ( + ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY, + SOURCE_ID NUMBER, + TARGET_ID NUMBER, + RELATION_TYPE VARCHAR2(100), + SOURCE_TEXT VARCHAR2(4000) + ) + '; + EXCEPTION + WHEN OTHERS THEN + IF SQLCODE != -955 THEN + RAISE; + END IF; + END; + """) + conn.commit() + print("βœ… ENTITIES and RELATIONS tables created or already exist.") + except Exception as e: + print(f"[ERROR] Failed to create tables: {e}") + finally: + cursor.close() + + +create_tables_if_not_exist(oracle_conn) + +# ========================= +# Global Configurations +# ========================= +INDEX_PATH = "./faiss_index" +PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl") +chapter_separator_regex = r"^(#{1,6} .+|\*\*.+\*\*)$" + +# ========================= +# LLM Definitions +# ========================= +llm = ChatOCIGenAI( + model_id="meta.llama-3.1-405b-instruct", + service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com", + compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + auth_profile="DEFAULT", + model_kwargs={"temperature": 0.7, "top_p": 0.75, "max_tokens": 4000}, +) + +llm_for_rag = ChatOCIGenAI( + model_id="meta.llama-3.1-405b-instruct", + service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com", + compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + auth_profile="DEFAULT", +) + +embeddings = OCIGenAIEmbeddings( + model_id="cohere.embed-multilingual-v3.0", + service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com", + compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + auth_profile="DEFAULT", +) + +def create_knowledge_graph(chunks): + cursor = oracle_conn.cursor() + + # Creates graph if it does not exist + try: + cursor.execute(f""" + BEGIN + EXECUTE IMMEDIATE ' + CREATE PROPERTY GRAPH {GRAPH_NAME} + VERTEX TABLES (ENTITIES + KEY (ID) + LABEL ENTITIES + PROPERTIES (NAME)) + EDGE TABLES (RELATIONS + KEY (ID) + SOURCE KEY (SOURCE_ID) REFERENCES ENTITIES(ID) + DESTINATION KEY (TARGET_ID) REFERENCES ENTITIES(ID) + LABEL RELATIONS + PROPERTIES (RELATION_TYPE, SOURCE_TEXT)) + '; + EXCEPTION + WHEN OTHERS THEN + IF SQLCODE != -55358 THEN -- ORA-55358: Graph already exists + RAISE; + END IF; + END; + """) + print(f"🧠 Graph '{GRAPH_NAME}' created or already exists.") + except Exception as e: + print(f"[GRAPH ERROR] Failed to create graph: {e}") + + # Inserting vertices and edges into the tables + for doc in chunks: + text = doc.page_content + source = doc.metadata.get("source", "unknown") + + if not text.strip(): + continue + + prompt = f""" + You are an expert in knowledge extraction. + + Given the following technical text: + + {text} + + Extract key entities and relationships in the format: + - Entity1 -[RELATION]-> Entity2 + + Use UPPERCASE for RELATION types. + Return 'NONE' if nothing found. 
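+
+        Example of the expected output format (illustrative only):
+        - Oracle Database -[SUPPORTS]-> Property Graphs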
+        """
+        try:
+            response = llm_for_rag.invoke(prompt)
+            result = response.content.strip()
+        except Exception as e:
+            print(f"[ERROR] Gen AI call error: {e}")
+            continue
+
+        if result.upper() == "NONE":
+            continue
+
+        triples = result.splitlines()
+        for triple in triples:
+            parts = triple.split("-[")
+            if len(parts) != 2:
+                continue
+
+            right_part = parts[1].split("]->")
+            if len(right_part) != 2:
+                continue
+
+            raw_relation, entity2 = right_part
+            relation = re.sub(r'\W+', '_', raw_relation.strip().upper())
+            entity1 = parts[0].strip().lstrip("-").strip()  # drop the leading list bullet ("- ") from the LLM output
+            entity2 = entity2.strip()
+
+            try:
+                # Insertion of entities (with existence check)
+                cursor.execute("MERGE INTO ENTITIES e USING (SELECT :name AS NAME FROM dual) src ON (e.name = src.name) WHEN NOT MATCHED THEN INSERT (NAME) VALUES (:name)", [entity1, entity1])
+                cursor.execute("MERGE INTO ENTITIES e USING (SELECT :name AS NAME FROM dual) src ON (e.name = src.name) WHEN NOT MATCHED THEN INSERT (NAME) VALUES (:name)", [entity2, entity2])
+                # Retrieve the IDs
+                cursor.execute("SELECT ID FROM ENTITIES WHERE NAME = :name", [entity1])
+                source_id = cursor.fetchone()[0]
+                cursor.execute("SELECT ID FROM ENTITIES WHERE NAME = :name", [entity2])
+                target_id = cursor.fetchone()[0]
+                # Create relations
+                cursor.execute("""
+                    INSERT INTO RELATIONS (SOURCE_ID, TARGET_ID, RELATION_TYPE, SOURCE_TEXT)
+                    VALUES (:src, :tgt, :rel, :txt)
+                """, [source_id, target_id, relation, source])
+                print(f"βœ… {entity1} -[{relation}]-> {entity2}")
+            except Exception as e:
+                print(f"[INSERT ERROR] {e}")
+
+    oracle_conn.commit()
+    cursor.close()
+    print("πŸ’Ύ Knowledge graph updated.")
+
+def extract_graph_keywords(question: str) -> str:
+    prompt = f"""
+    Based on the question below, extract relevant keywords (1 to 2 words per term) that can be used to search for entities and relationships in a technical knowledge graph.
+
+    Question: "{question}"
+
+    Rules:
+    - Split compound terms (e.g., "API Gateway" β†’ "API", "Gateway")
+    - Remove duplicates
+    - Do not include generic words such as: "what", "how", "the", "of", "in the document", etc.
+    - Return only the keywords, separated by commas. No explanations.
+
+    Result:
+    """
+    try:
+        resp = llm_for_rag.invoke(prompt)
+        keywords_raw = resp.content.strip()
+
+        # Additional post-processing: remove duplicates, normalize
+        keywords = {kw.strip().lower() for kw in re.split(r'[,\n]+', keywords_raw)}
+        keywords = [kw for kw in keywords if kw]  # remove empty strings
+        return ", ".join(sorted(keywords))
+    except Exception as e:
+        print(f"[KEYWORD EXTRACTION ERROR] {e}")
+        return ""
+
+def query_knowledge_graph(query_text):
+    cursor = oracle_conn.cursor()
+
+    # NOTE: the keyword string is only lowercased here; it is interpolated
+    # directly into the PGQL text, so it must come from a trusted source.
+    sanitized_text = query_text.lower()
+
+    pgql = f"""
+    SELECT from_entity,
+           relation_type,
+           to_entity
+    FROM GRAPH_TABLE(
+        {GRAPH_NAME}
+        MATCH (e1 is ENTITIES)-[r is RELATIONS]->(e2 is ENTITIES)
+        WHERE CONTAINS(e1.name, '{sanitized_text}') > 0
+           OR CONTAINS(e2.name, '{sanitized_text}') > 0
+           OR CONTAINS(r.RELATION_TYPE, '{sanitized_text}') > 0
+        COLUMNS (
+            e1.name AS from_entity,
+            r.RELATION_TYPE AS relation_type,
+            e2.name AS to_entity
+        )
+    )
+    FETCH FIRST 20 ROWS ONLY
+    """
+
+    print(pgql)
+
+    try:
+        cursor.execute(pgql)
+        rows = cursor.fetchall()
+        if not rows:
+            return "⚠️ No relationships found in the graph."
+
+        return "\n".join(f"{r[0]} -[{r[1]}]-> {r[2]}" for r in rows)
+
+    except Exception as e:
+        return f"[PGQL ERROR] {e}"
+
+    finally:
+        cursor.close()
+
+def split_llm_output_into_chapters(llm_text):
+    chapters = []
+    current_chapter = []
+    lines = llm_text.splitlines()
+
+    for line in lines:
+        if re.match(chapter_separator_regex, line):
+            if current_chapter:
+                chapters.append("\n".join(current_chapter).strip())
+            current_chapter = [line]
+        else:
+            current_chapter.append(line)
+
+    if current_chapter:
+        chapters.append("\n".join(current_chapter).strip())
+
+    return chapters
+
+
+def semantic_chunking(text):
+    prompt = f"""
+    You received the following text extracted via OCR:
+
+    {text}
+
+    Your task:
+    1. Identify headings (short uppercase or bold lines, no period at the end)
+    2. Separate paragraphs by heading
+    3. Indicate columns with [COLUMN 1], [COLUMN 2] if present
+    4. Indicate tables with [TABLE] in markdown format
+    """
+
+    # Retry a few times instead of looping forever on a persistent failure
+    for attempt in range(3):
+        try:
+            return llm_for_rag.invoke(prompt)
+        except Exception as e:
+            print(f"[ERROR] Gen AI call error (attempt {attempt + 1}/3): {e}")
+
+    raise RuntimeError("semantic_chunking: all Gen AI attempts failed")
+
+
+def read_pdfs(pdf_path):
+    if "-ocr" in pdf_path:
+        doc_pages = PyMuPDFLoader(str(pdf_path)).load()
+    else:
+        doc_pages = UnstructuredPDFLoader(str(pdf_path)).load()
+    full_text = "\n".join([page.page_content for page in doc_pages])
+    return full_text
+
+
+def smart_split_text(text, max_chunk_size=10_000):
+    chunks = []
+    start = 0
+    text_length = len(text)
+
+    while start < text_length:
+        end = min(start + max_chunk_size, text_length)
+        split_point = max(
+            text.rfind('.', start, end),
+            text.rfind('!', start, end),
+            text.rfind('?', start, end),
+            text.rfind('\n\n', start, end)
+        )
+        if split_point == -1 or split_point <= start:
+            split_point = end
+        else:
+            split_point += 1
+
+        chunk = text[start:split_point].strip()
+        if chunk:
+            chunks.append(chunk)
+
+        start = split_point
+
+    return chunks
+
+
+def load_previously_indexed_docs():
+    if os.path.exists(PROCESSED_DOCS_FILE):
+        with open(PROCESSED_DOCS_FILE, "rb") as f:
+            return pickle.load(f)
+    return set()
+
+
+def save_indexed_docs(docs):
+    with open(PROCESSED_DOCS_FILE, "wb") as f:
+        pickle.dump(docs, f)
+
+
+# =========================
+# Main Function
+# =========================
+def chat():
+    # pdf_paths = [
+    #     './Manuals/SOASUITE.pdf',
+    #     './Manuals/using-integrations-oracle-integration-3.pdf'
+    # ]
+
+    pdf_paths = ['AAAAAAAAAA.pdf']  # Your PDF files as a knowledge base
+
+    already_indexed_docs = load_previously_indexed_docs()
+    updated_docs = set()
+
+    try:
+        vectorstore = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
+        print("βœ”οΈ FAISS index loaded.")
+    except Exception:
+        print("⚠️ FAISS index not found, creating a new one.")
+        vectorstore = None
+
+    new_chunks = []
+
+    for pdf_path in tqdm(pdf_paths, desc="πŸ“„ Processing PDFs"):
+        print(f" {os.path.basename(pdf_path)}")
+        if pdf_path in already_indexed_docs:
+            print(f"βœ… Document already indexed: {pdf_path}")
+            continue
+        full_text = read_pdfs(pdf_path=pdf_path)
+
+        text_chunks = smart_split_text(full_text, max_chunk_size=10_000)
+        overflow_buffer = ""
+
+        for chunk in tqdm(text_chunks, desc="πŸ“„ Processing text chunks", dynamic_ncols=True, leave=False):
+            current_text = overflow_buffer + chunk
+
+            treated_text = semantic_chunking(current_text)
+
+            if hasattr(treated_text, "content"):
+                chapters = split_llm_output_into_chapters(treated_text.content)
+
+                last_chapter = chapters[-1] if chapters else ""
+
+                if last_chapter and not last_chapter.strip().endswith((".", "!", "?", "\n\n")):
+                    print("πŸ“Œ Last chapter seems incomplete, saving for the next cycle")
+                    overflow_buffer = last_chapter
+                    chapters = chapters[:-1]
+                else:
+                    overflow_buffer = ""
+
+                for chapter_text in chapters:
+                    doc = Document(page_content=chapter_text, metadata={"source": pdf_path})
+                    new_chunks.append(doc)
+                    print(f"βœ… New chapter indexed:\n{chapter_text}...\n")
+
+            else:
+                print(f"[ERROR] semantic_chunking returned unexpected type: {type(treated_text)}")
+
+        updated_docs.add(str(pdf_path))
+
+    if new_chunks:
+        if vectorstore:
+            vectorstore.add_documents(new_chunks)
+        else:
+            vectorstore = FAISS.from_documents(new_chunks, embedding=embeddings)
+
+        vectorstore.save_local(INDEX_PATH)
+        save_indexed_docs(already_indexed_docs.union(updated_docs))
+        print(f"πŸ’Ύ {len(new_chunks)} chunks added to FAISS index.")
+
+        print("🧠 Building knowledge graph...")
+        create_knowledge_graph(new_chunks)
+
+    else:
+        print("πŸ“ No new documents to index.")
+
+    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 50, "fetch_k": 100})
+
+    template = """
+    Document context:
+    {context}
+
+    Graph context:
+    {graph_context}
+
+    Question:
+    {input}
+
+    Interpretation rules:
+    - You can search for a step-by-step tutorial about a subject
+    - You can search for a concept description about a subject
+    - You can search for a list of components about a subject
+    """
+    prompt = PromptTemplate.from_template(template)
+
+    def get_context(x):
+        query = x.get("input") if isinstance(x, dict) else x
+        return retriever.invoke(query)
+
+    chain = (
+        RunnableMap({
+            "context": RunnableLambda(get_context),
+            "graph_context": RunnableLambda(lambda x: query_knowledge_graph(extract_graph_keywords(x.get("input") if isinstance(x, dict) else x))),
+            "input": lambda x: x.get("input") if isinstance(x, dict) else x
+        })
+        | prompt
+        | llm
+        | StrOutputParser()
+    )
+
+    print("βœ… READY")
+
+    while True:
+        query = input("❓ Question (or 'quit' to exit): ")
+        if query.lower() == "quit":
+            break
+        response = chain.invoke(query)
+        print("\nπŸ“œ RESPONSE:\n")
+        print(response)
+        print("\n" + "=" * 80 + "\n")
+
+# if __name__ == "__main__":
+#     print("Starting")
+#     print(query_knowledge_graph("gateway"))
+
+# πŸš€ Run
+if __name__ == "__main__":
+    chat()
\ No newline at end of file
diff --git a/images/img.png b/images/img.png
new file mode 100644
index 0000000..b36dc0b
Binary files /dev/null and b/images/img.png differ
diff --git a/images/img_1.png b/images/img_1.png
new file mode 100644
index 0000000..0b78ad2
Binary files /dev/null and b/images/img_1.png differ
diff --git a/index.md b/index.md
new file mode 100644
index 0000000..9897016
--- /dev/null
+++ b/index.md
@@ -0,0 +1,323 @@
+## Understanding Graph Theory and Implementing a Knowledge Graph with Oracle Autonomous Database and PGQL
+
+### Introduction
+
+This document explores the concepts of graph theory and knowledge graphs, and how they are implemented using Oracle Autonomous Database with PGQL (Property Graph Query Language). It also explains the Python implementation used to extract relationships from documents using LLMs and store them as graph structures in Oracle.
+
+### What is Graph Theory?
+
+Graph theory is a field of mathematics and computer science focused on modeling relationships between objects. A graph consists of:
+
+β€’ **Vertices** (nodes): Represent entities.
+
+β€’ **Edges** (links): Represent relationships between entities.
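+
+To make the vertex/edge model concrete, here is a minimal sketch (entity and relation names are invented for illustration) that represents a graph as a plain Python adjacency list, using the same `Entity1 -[RELATION]-> Entity2` notation adopted later in this document:
+
+```python
+# A tiny graph: vertices are entity names, edges are (RELATION, target) pairs.
+graph = {
+    "Oracle Autonomous Database": [("SUPPORTS", "Property Graphs")],
+    "Property Graphs": [("QUERIED_WITH", "PGQL")],
+}
+
+for source, edges in graph.items():
+    for relation, target in edges:
+        print(f"{source} -[{relation}]-> {target}")
+```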
+
+Graphs are widely used for representing data structures in social networks, semantic networks, knowledge graphs, and more.
+
+### What is a Knowledge Graph?
+
+A knowledge graph is a graph-based representation of real-world knowledge where:
+
+β€’ Nodes represent entities (people, places, products, etc.)
+
+β€’ Edges represent semantic relationships (e.g., β€œWORKS_AT”, β€œPART_OF”)
+
+Knowledge graphs enhance semantic search, recommendation systems, and question-answering applications.
+
+### Why Use Oracle Autonomous Database with PGQL?
+
+Oracle provides a fully managed environment to store and query property graphs:
+
+β€’ PGQL (Property Graph Query Language) is SQL-like and designed for querying complex graph patterns.
+
+β€’ Oracle Autonomous Database runs graph queries natively with property graph features, including creation, querying, and visualization.
+
+β€’ Integration with LLMs enables automatic extraction of graph structures from unstructured data (such as PDFs).
+
+### Comparison with Other Graph Query Languages
+
+![img.png](./images/img.png)
+
+### Advantages of Oracle Autonomous PGQL vs. Traditional Graph DBs
+
+![img_1.png](./images/img_1.png)
+
+### Create Knowledge Graph
+
+These are the steps this service executes:
+
+β€’ Creating the graph schema
+
+β€’ Extracting entities and relationships using an LLM
+
+β€’ Inserting data into Oracle
+
+β€’ Building the property graph
+
+```python
+def create_knowledge_graph(chunks):
+    cursor = oracle_conn.cursor()
+
+    # Creates graph if it does not exist
+    try:
+        cursor.execute(f"""
+            BEGIN
+                EXECUTE IMMEDIATE '
+                    CREATE PROPERTY GRAPH {GRAPH_NAME}
+                    VERTEX TABLES (ENTITIES
+                        KEY (ID)
+                        LABEL ENTITIES
+                        PROPERTIES (NAME))
+                    EDGE TABLES (RELATIONS
+                        KEY (ID)
+                        SOURCE KEY (SOURCE_ID) REFERENCES ENTITIES(ID)
+                        DESTINATION KEY (TARGET_ID) REFERENCES ENTITIES(ID)
+                        LABEL RELATIONS
+                        PROPERTIES (RELATION_TYPE, SOURCE_TEXT))
+                ';
+            EXCEPTION
+                WHEN OTHERS THEN
+                    IF SQLCODE != -55358 THEN  -- ORA-55358: Graph already exists
+                        RAISE;
+                    END IF;
+            END;
+        """)
+        print(f"🧠 Graph '{GRAPH_NAME}' created or already exists.")
+    except Exception as e:
+        print(f"[GRAPH ERROR] Failed to create graph: {e}")
+
+    # Inserting vertices and edges into the tables
+    for doc in chunks:
+        text = doc.page_content
+        source = doc.metadata.get("source", "unknown")
+
+        if not text.strip():
+            continue
+
+        prompt = f"""
+        You are an expert in knowledge extraction.
+
+        Given the following technical text:
+
+        {text}
+
+        Extract key entities and relationships in the format:
+        - Entity1 -[RELATION]-> Entity2
+
+        Use UPPERCASE for RELATION types.
+        Return 'NONE' if nothing found.
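+
+        Example of the expected output format (illustrative only):
+        - Oracle Database -[SUPPORTS]-> Property Graphs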
+        """
+        try:
+            response = llm_for_rag.invoke(prompt)
+            result = response.content.strip()
+        except Exception as e:
+            print(f"[ERROR] Gen AI call error: {e}")
+            continue
+
+        if result.upper() == "NONE":
+            continue
+
+        triples = result.splitlines()
+        for triple in triples:
+            parts = triple.split("-[")
+            if len(parts) != 2:
+                continue
+
+            right_part = parts[1].split("]->")
+            if len(right_part) != 2:
+                continue
+
+            raw_relation, entity2 = right_part
+            relation = re.sub(r'\W+', '_', raw_relation.strip().upper())
+            entity1 = parts[0].strip().lstrip("-").strip()  # drop the leading list bullet ("- ") from the LLM output
+            entity2 = entity2.strip()
+
+            try:
+                # Insertion of entities (with existence check)
+                cursor.execute("MERGE INTO ENTITIES e USING (SELECT :name AS NAME FROM dual) src ON (e.name = src.name) WHEN NOT MATCHED THEN INSERT (NAME) VALUES (:name)", [entity1, entity1])
+                cursor.execute("MERGE INTO ENTITIES e USING (SELECT :name AS NAME FROM dual) src ON (e.name = src.name) WHEN NOT MATCHED THEN INSERT (NAME) VALUES (:name)", [entity2, entity2])
+                # Retrieve the IDs
+                cursor.execute("SELECT ID FROM ENTITIES WHERE NAME = :name", [entity1])
+                source_id = cursor.fetchone()[0]
+                cursor.execute("SELECT ID FROM ENTITIES WHERE NAME = :name", [entity2])
+                target_id = cursor.fetchone()[0]
+                # Create relations
+                cursor.execute("""
+                    INSERT INTO RELATIONS (SOURCE_ID, TARGET_ID, RELATION_TYPE, SOURCE_TEXT)
+                    VALUES (:src, :tgt, :rel, :txt)
+                """, [source_id, target_id, relation, source])
+                print(f"βœ… {entity1} -[{relation}]-> {entity2}")
+            except Exception as e:
+                print(f"[INSERT ERROR] {e}")
+
+    oracle_conn.commit()
+    cursor.close()
+    print("πŸ’Ύ Knowledge graph updated.")
+```
+
+β€’ The graph schema is created with CREATE PROPERTY GRAPH, linking ENTITIES (vertices) and RELATIONS (edges).
+
+β€’ MERGE INTO inserts new entities only if they don't already exist (ensuring uniqueness).
+
+β€’ An LLM (Oracle Generative AI) extracts triples of the form Entity1 -[RELATION]-> Entity2.
+
+β€’ All interactions with Oracle are done via oracledb and PL/SQL anonymous blocks.
+
+#### Next Steps
+
+β€’ Use PGQL to explore and query graph relationships.
+
+β€’ Connect to Graph Studio for visualizations.
+
+β€’ Expose the graph through a REST API or a LangChain agent.
+
+### πŸ“Œ Graph Query Support Functions
+
+Two essential functions enable semantic search and reasoning over the knowledge graph: **extract_graph_keywords** and **query_knowledge_graph**. Together they turn natural-language questions into meaningful PGQL queries on Oracle Autonomous Database.
+
+#### extract_graph_keywords
+
+```python
+def extract_graph_keywords(question: str) -> str:
+    prompt = f"""
+    Based on the question below, extract relevant keywords (1 to 2 words per term) that can be used to search for entities and relationships in a technical knowledge graph.
+
+    Question: "{question}"
+
+    Rules:
+    - Split compound terms (e.g., "API Gateway" β†’ "API", "Gateway")
+    - Remove duplicates
+    - Do not include generic words such as: "what", "how", "the", "of", "in the document", etc.
+    - Return only the keywords, separated by commas. No explanations.
+
+    Result:
+    """
+    try:
+        resp = llm_for_rag.invoke(prompt)
+        keywords_raw = resp.content.strip()
+
+        # Additional post-processing: remove duplicates, normalize
+        keywords = {kw.strip().lower() for kw in re.split(r'[,\n]+', keywords_raw)}
+        keywords = [kw for kw in keywords if kw]  # remove empty strings
+        return ", ".join(sorted(keywords))
+    except Exception as e:
+        print(f"[KEYWORD EXTRACTION ERROR] {e}")
+        return ""
+```
+
+βœ… What it does:
+
+β€’ Uses an LLM (llm_for_rag) to transform natural-language questions into a list of graph-friendly keywords.
+
+β€’ The prompt is designed to cleanly extract entities and terms that are relevant for searching the graph.
+
+πŸ’‘ Why it's important:
+
+β€’ It bridges the gap between unstructured questions and structured queries.
+
+β€’ It ensures that only specific, domain-relevant terms are used for matching in the PGQL query.
+
+🧠 LLM-enhanced behavior:
+
+β€’ Breaks up compound technical terms.
+
+β€’ Removes stop words (such as β€œwhat”, β€œhow”, etc.).
+
+β€’ Normalizes text by lowercasing and deduplicating terms.
+
+πŸ“Œ Example:
+
+    Input:
+
+    "What are the main components of an API Gateway architecture?"
+
+    Output keywords (alphabetically sorted by the function):
+
+    api, architecture, components, gateway
+
+#### query_knowledge_graph
+
+```python
+def query_knowledge_graph(query_text):
+    cursor = oracle_conn.cursor()
+
+    # NOTE: the keyword string is only lowercased here; it is interpolated
+    # directly into the PGQL text, so it must come from a trusted source.
+    sanitized_text = query_text.lower()
+
+    pgql = f"""
+    SELECT from_entity,
+           relation_type,
+           to_entity
+    FROM GRAPH_TABLE(
+        {GRAPH_NAME}
+        MATCH (e1 is ENTITIES)-[r is RELATIONS]->(e2 is ENTITIES)
+        WHERE CONTAINS(e1.name, '{sanitized_text}') > 0
+           OR CONTAINS(e2.name, '{sanitized_text}') > 0
+           OR CONTAINS(r.RELATION_TYPE, '{sanitized_text}') > 0
+        COLUMNS (
+            e1.name AS from_entity,
+            r.RELATION_TYPE AS relation_type,
+            e2.name AS to_entity
+        )
+    )
+    FETCH FIRST 20 ROWS ONLY
+    """
+
+    print(pgql)
+
+    try:
+        cursor.execute(pgql)
+        rows = cursor.fetchall()
+        if not rows:
+            return "⚠️ No relationships found in the graph."
+
+        return "\n".join(f"{r[0]} -[{r[1]}]-> {r[2]}" for r in rows)
+
+    except Exception as e:
+        return f"[PGQL ERROR] {e}"
+
+    finally:
+        cursor.close()
+```
+
+βœ… What it does:
+
+β€’ Accepts a keyword-based string (often produced by extract_graph_keywords) and constructs a PGQL query to retrieve relationships from the knowledge graph.
+
+βš™οΈ Key mechanics:
+
+β€’ The GRAPH_TABLE clause uses MATCH to traverse the graph from source to target node.
+
+β€’ It uses CONTAINS() for partial, full-text matching on node/edge attributes (e1.name, e2.name, r.RELATION_TYPE); note that CONTAINS() requires an Oracle Text index on those columns.
+
+β€’ Results are limited to 20 rows to avoid flooding the output.
+
+πŸ†š Why use Oracle PGQL:
+
+β€’ PGQL (Property Graph Query Language) is SQL-like but designed for graph traversal.
+
+β€’ Oracle Autonomous Database supports property graphs, which allows seamless integration between the relational and graph worlds.
+
+β€’ It offers indexing, optimization, and native graph search capabilities that are enterprise-ready.
+
+🧠 Oracle-Specific Notes:
+
+β€’ GRAPH_TABLE() follows the SQL/PGQ standard and lets Oracle run graph pattern queries over logical graph views defined on relational tables.
+
+β€’ Unlike Cypher (Neo4j), PGQL runs over structured data using SQL extensions, making it friendlier in RDBMS-heavy environments.
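+
+As a minimal usage sketch, the two functions can be chained directly; this mirrors how the RAG chain in main.py wires them together to build the graph context (the question and printed outputs below are illustrative only):
+
+```python
+# Hypothetical end-to-end flow: question -> keywords -> graph triples.
+question = "What are the main components of an API Gateway architecture?"
+
+keywords = extract_graph_keywords(question)
+print(keywords)        # e.g. "api, architecture, components, gateway"
+
+graph_context = query_knowledge_graph(keywords)
+print(graph_context)   # e.g. "API -[ROUTES_TO]-> Backend Service" (depends on your graph)
+```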
+
+## References
+
+- [Analyze PDF Documents in Natural Language with OCI Generative AI](https://docs.oracle.com/en/learn/oci-genai-pdf/)
+- [Oracle Graph Learning Path](https://blogs.oracle.com/database/post/oracle-graph-learning-path)
+- [Graph Developer's Guide for Property Graph](https://docs.oracle.com/en/database/oracle/property-graph/25.2/spgdg/oracle-graph-python-client.html#GUID-9800E556-0B6C-4EAF-A4FC-9AE9AB46023C)
+- [Getting Started with Property Graphs in Oracle Database 23ai](https://blogs.oracle.com/ace/post/getting-started-with-property-graphs-in-oracle-database-23ai)
+
+## Acknowledgments
+
+- **Author** - Cristiano Hoshikawa (Oracle LAD A-Team Solution Engineer)