Mirror of https://github.com/hoshikawa2/oci_graph_23ai.git
Commit: First Commit
files/main.py (new file, 503 lines)
@@ -0,0 +1,503 @@
from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
from langchain_core.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_community.embeddings import OCIGenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnableMap
from langchain_community.document_loaders import UnstructuredPDFLoader, PyMuPDFLoader
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda

from tqdm import tqdm
import os
import pickle
import re
import atexit
import oracledb

# =========================
# Oracle Autonomous Configuration
# =========================
WALLET_PATH = "Wallet_oradb23ai"  # Folder with your downloaded and unzipped Autonomous Database wallet
DB_ALIAS = "oradb23ai_high"       # Your database name plus the _high TNS alias
USERNAME = "USERNAME"             # Your database username
PASSWORD = "PASSWORD"             # Your database password (also passed as the wallet password below)
os.environ["TNS_ADMIN"] = WALLET_PATH
GRAPH_NAME = "my_graph"

oracle_conn = oracledb.connect(
    user=USERNAME,
    password=PASSWORD,
    dsn=DB_ALIAS,
    config_dir=WALLET_PATH,
    wallet_location=WALLET_PATH,
    wallet_password=PASSWORD,
)
atexit.register(oracle_conn.close)

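# Optional smoke test (a sketch, not part of the original flow): verify the
# wallet connection resolves before any DDL runs.
# with oracle_conn.cursor() as check:
#     check.execute("SELECT 1 FROM dual")
#     print("DB connection OK:", check.fetchone()[0])
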
# =========================
# Graph Backing Tables (ENTITIES / RELATIONS)
# =========================
def create_tables_if_not_exist(conn):
    cursor = conn.cursor()
    try:
        cursor.execute("""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE TABLE ENTITIES (
                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                        NAME VARCHAR2(500)
                    )
                ';
            EXCEPTION
                WHEN OTHERS THEN
                    IF SQLCODE != -955 THEN  -- ORA-00955: name is already used by an existing object
                        RAISE;
                    END IF;
            END;
        """)
        cursor.execute("""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE TABLE RELATIONS (
                        ID NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
                        SOURCE_ID NUMBER,
                        TARGET_ID NUMBER,
                        RELATION_TYPE VARCHAR2(100),
                        SOURCE_TEXT VARCHAR2(4000)
                    )
                ';
            EXCEPTION
                WHEN OTHERS THEN
                    IF SQLCODE != -955 THEN
                        RAISE;
                    END IF;
            END;
        """)
        conn.commit()
        print("✅ ENTITIES and RELATIONS tables created or already exist.")
    except Exception as e:
        print(f"[ERROR] Failed to create tables: {e}")
    finally:
        cursor.close()


create_tables_if_not_exist(oracle_conn)

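# Optional check (sketch): list the backing tables to confirm the DDL above
# took effect in this schema.
# with oracle_conn.cursor() as cur:
#     cur.execute("SELECT table_name FROM user_tables WHERE table_name IN ('ENTITIES', 'RELATIONS')")
#     print([row[0] for row in cur.fetchall()])
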
# =========================
# Global Configurations
# =========================
INDEX_PATH = "./faiss_index"
PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl")
chapter_separator_regex = r"^(#{1,6} .+|\*\*.+\*\*)$"

# =========================
# LLM Definitions
# =========================
llm = ChatOCIGenAI(
    model_id="meta.llama-3.1-405b-instruct",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",  # Your compartment OCID
    auth_profile="DEFAULT",
    model_kwargs={"temperature": 0.7, "top_p": 0.75, "max_tokens": 4000},
)

llm_for_rag = ChatOCIGenAI(
    model_id="meta.llama-3.1-405b-instruct",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",  # Your compartment OCID
    auth_profile="DEFAULT",
)

embeddings = OCIGenAIEmbeddings(
    model_id="cohere.embed-multilingual-v3.0",
    service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
    compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",  # Your compartment OCID
    auth_profile="DEFAULT",
)

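# Optional sanity check (sketch, assumes the DEFAULT OCI profile is valid):
# embedding one short string confirms the Gen AI endpoint is reachable before
# indexing whole PDFs; cohere.embed-multilingual-v3.0 returns 1024-dim vectors.
# print(len(embeddings.embed_query("connectivity test")))
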
def create_knowledge_graph(chunks):
    cursor = oracle_conn.cursor()

    # Create the property graph if it does not exist
    try:
        cursor.execute(f"""
            BEGIN
                EXECUTE IMMEDIATE '
                    CREATE PROPERTY GRAPH {GRAPH_NAME}
                        VERTEX TABLES (ENTITIES
                            KEY (ID)
                            LABEL ENTITIES
                            PROPERTIES (NAME))
                        EDGE TABLES (RELATIONS
                            KEY (ID)
                            SOURCE KEY (SOURCE_ID) REFERENCES ENTITIES(ID)
                            DESTINATION KEY (TARGET_ID) REFERENCES ENTITIES(ID)
                            LABEL RELATIONS
                            PROPERTIES (RELATION_TYPE, SOURCE_TEXT))
                ';
            EXCEPTION
                WHEN OTHERS THEN
                    IF SQLCODE != -55358 THEN  -- ORA-55358: graph already exists
                        RAISE;
                    END IF;
            END;
        """)
        print(f"🧠 Graph '{GRAPH_NAME}' created or already exists.")
    except Exception as e:
        print(f"[GRAPH ERROR] Failed to create graph: {e}")

    # Insert vertices and edges into the backing tables
    for doc in chunks:
        text = doc.page_content
        source = doc.metadata.get("source", "unknown")

        if not text.strip():
            continue

        prompt = f"""
        You are an expert in knowledge extraction.

        Given the following technical text:

        {text}

        Extract key entities and relationships in the format:
        - Entity1 -[RELATION]-> Entity2

        Use UPPERCASE for RELATION types.
        Return 'NONE' if nothing is found.
        """
        try:
            response = llm_for_rag.invoke(prompt)
            result = response.content.strip()
        except Exception as e:
            print(f"[ERROR] Gen AI call error: {e}")
            continue

        if result.upper() == "NONE":
            continue

        for triple in result.splitlines():
            parts = triple.split("-[")
            if len(parts) != 2:
                continue

            right_part = parts[1].split("]->")
            if len(right_part) != 2:
                continue

            raw_relation, entity2 = right_part
            relation = re.sub(r'\W+', '_', raw_relation.strip().upper())
            entity1 = parts[0].strip().lstrip("- ").strip()  # drop the list bullet the prompt asks the model to emit
            entity2 = entity2.strip()

            try:
                # Insert entities idempotently (MERGE avoids duplicate names)
                for name in (entity1, entity2):
                    cursor.execute("""
                        MERGE INTO ENTITIES e
                        USING (SELECT :name AS NAME FROM dual) src
                        ON (e.name = src.name)
                        WHEN NOT MATCHED THEN INSERT (NAME) VALUES (src.name)
                    """, {"name": name})
                # Retrieve the generated IDs
                cursor.execute("SELECT ID FROM ENTITIES WHERE NAME = :name", {"name": entity1})
                source_id = cursor.fetchone()[0]
                cursor.execute("SELECT ID FROM ENTITIES WHERE NAME = :name", {"name": entity2})
                target_id = cursor.fetchone()[0]
                # Create the relation edge
                cursor.execute("""
                    INSERT INTO RELATIONS (SOURCE_ID, TARGET_ID, RELATION_TYPE, SOURCE_TEXT)
                    VALUES (:src, :tgt, :rel, :txt)
                """, {"src": source_id, "tgt": target_id, "rel": relation, "txt": source})
                print(f"✅ {entity1} -[{relation}]-> {entity2}")
            except Exception as e:
                print(f"[INSERT ERROR] {e}")

    oracle_conn.commit()
    cursor.close()
    print("💾 Knowledge graph updated.")

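# Minimal usage sketch (hypothetical inline document, not from the original
# file): running one sentence through the extractor should insert a couple of
# vertices and one edge.
# create_knowledge_graph([Document(
#     page_content="API Gateway routes requests to Oracle Functions.",
#     metadata={"source": "inline-example"},
# )])
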
def extract_graph_keywords(question: str) -> str:
    prompt = f"""
    Based on the question below, extract relevant keywords (1 to 2 words per term) that can be used to search for entities and relationships in a technical knowledge graph.

    Question: "{question}"

    Rules:
    - Split compound terms (e.g., "API Gateway" → "API", "Gateway")
    - Remove duplicates
    - Do not include generic words such as: "what", "how", "the", "of", "in the document", etc.
    - Return only the keywords, separated by commas. No explanations.

    Result:
    """
    try:
        resp = llm_for_rag.invoke(prompt)
        keywords_raw = resp.content.strip()

        # Post-processing: deduplicate and normalize to lowercase
        keywords = {kw.strip().lower() for kw in re.split(r'[,\n]+', keywords_raw)}
        keywords = [kw for kw in keywords if kw]  # drop empty strings
        return ", ".join(sorted(keywords))
    except Exception as e:
        print(f"[KEYWORD EXTRACTION ERROR] {e}")
        return ""

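# Illustration (actual output depends on the LLM): for a question like
# "How does the API Gateway call Functions?" this typically returns
# something like "api, functions, gateway".
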
def query_knowledge_graph(query_text):
    cursor = oracle_conn.cursor()

    sanitized_text = query_text.lower()

    # Note: CONTAINS() requires Oracle Text indexes on ENTITIES.NAME and
    # RELATIONS.RELATION_TYPE. The keyword list is interpolated into the
    # statement, so it should only come from extract_graph_keywords(),
    # never from raw user input.
    pgql = f"""
        SELECT from_entity,
               relation_type,
               to_entity
          FROM GRAPH_TABLE(
                 {GRAPH_NAME}
                 MATCH (e1 IS ENTITIES)-[r IS RELATIONS]->(e2 IS ENTITIES)
                 WHERE CONTAINS(e1.name, '{sanitized_text}') > 0
                    OR CONTAINS(e2.name, '{sanitized_text}') > 0
                    OR CONTAINS(r.RELATION_TYPE, '{sanitized_text}') > 0
                 COLUMNS (
                     e1.name AS from_entity,
                     r.RELATION_TYPE AS relation_type,
                     e2.name AS to_entity
                 )
               )
         FETCH FIRST 20 ROWS ONLY
    """

    print(pgql)

    try:
        cursor.execute(pgql)
        rows = cursor.fetchall()
        if not rows:
            return "⚠️ No relationships found in the graph."
        return "\n".join(f"{r[0]} -[{r[1]}]-> {r[2]}" for r in rows)
    except Exception as e:
        return f"[PGQL ERROR] {e}"
    finally:
        cursor.close()

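# Note: in Oracle Text, a comma inside a CONTAINS query is the ACCUM operator,
# so the comma-separated list from extract_graph_keywords() matches rows
# containing any of the keywords. A typical result line looks like
# "API Gateway -[ROUTES_TO]-> Oracle Functions" (illustrative).
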
def split_llm_output_into_chapters(llm_text):
    chapters = []
    current_chapter = []
    lines = llm_text.splitlines()

    for line in lines:
        if re.match(chapter_separator_regex, line):
            if current_chapter:
                chapters.append("\n".join(current_chapter).strip())
            current_chapter = [line]
        else:
            current_chapter.append(line)

    if current_chapter:
        chapters.append("\n".join(current_chapter).strip())

    return chapters

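# Quick illustration (hypothetical input): each line matching
# chapter_separator_regex starts a new chapter, so
# split_llm_output_into_chapters("# Intro\ntext\n## Setup\nmore text")
# returns ["# Intro\ntext", "## Setup\nmore text"].
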
def semantic_chunking(text):
    prompt = f"""
    You received the following text extracted via OCR:

    {text}

    Your task:
    1. Identify headings (short uppercase or bold lines, no period at the end)
    2. Separate paragraphs by heading
    3. Indicate columns with [COLUMN 1], [COLUMN 2] if present
    4. Indicate tables with [TABLE] in markdown format
    """

    # Retry transient Gen AI errors, but give up after a few attempts instead
    # of looping forever on a permanent failure.
    for attempt in range(3):
        try:
            return llm_for_rag.invoke(prompt)
        except Exception as e:
            print(f"[ERROR] Gen AI call error (attempt {attempt + 1}/3): {e}")
    return None

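# Note: semantic_chunking returns the raw chat-model response (an AIMessage
# whose .content holds the restructured text), or None after repeated
# failures — which is why the caller below checks hasattr(..., "content").
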
def read_pdfs(pdf_path):
    # Files named with "-ocr" are already OCR'd, so the lighter PyMuPDF loader
    # suffices; everything else goes through UnstructuredPDFLoader.
    if "-ocr" in pdf_path:
        doc_pages = PyMuPDFLoader(str(pdf_path)).load()
    else:
        doc_pages = UnstructuredPDFLoader(str(pdf_path)).load()
    full_text = "\n".join(page.page_content for page in doc_pages)
    return full_text

def smart_split_text(text, max_chunk_size=10_000):
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = min(start + max_chunk_size, text_length)
        # Prefer to cut at a sentence or paragraph boundary inside the window
        split_point = max(
            text.rfind('.', start, end),
            text.rfind('!', start, end),
            text.rfind('?', start, end),
            text.rfind('\n\n', start, end)
        )
        if split_point == -1 or split_point <= start:
            split_point = end
        else:
            split_point += 1

        chunk = text[start:split_point].strip()
        if chunk:
            chunks.append(chunk)

        start = split_point

    return chunks

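# Boundary behavior (hypothetical input): the split prefers sentence ends, so
# smart_split_text("First sentence. Second sentence.", max_chunk_size=20)
# returns ["First sentence.", "Second sentence."].
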
def load_previously_indexed_docs():
    if os.path.exists(PROCESSED_DOCS_FILE):
        with open(PROCESSED_DOCS_FILE, "rb") as f:
            return pickle.load(f)
    return set()


def save_indexed_docs(docs):
    os.makedirs(INDEX_PATH, exist_ok=True)  # safety: the index folder may not exist yet
    with open(PROCESSED_DOCS_FILE, "wb") as f:
        pickle.dump(docs, f)

# =========================
# Main Function
# =========================
def chat():
    # pdf_paths = [
    #     './Manuals/SOASUITE.pdf',
    #     './Manuals/using-integrations-oracle-integration-3.pdf'
    # ]

    pdf_paths = ['AAAAAAAAAA.pdf']  # Your PDF files as a knowledge base

    already_indexed_docs = load_previously_indexed_docs()
    updated_docs = set()

    try:
        vectorstore = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
        print("✔️ FAISS index loaded.")
    except Exception:
        print("⚠️ FAISS index not found, creating a new one.")
        vectorstore = None

    new_chunks = []

    for pdf_path in tqdm(pdf_paths, desc="📄 Processing PDFs"):
        print(f" {os.path.basename(pdf_path)}")
        if pdf_path in already_indexed_docs:
            print(f"✅ Document already indexed: {pdf_path}")
            continue
        full_text = read_pdfs(pdf_path=pdf_path)

        text_chunks = smart_split_text(full_text, max_chunk_size=10_000)
        overflow_buffer = ""

        for chunk in tqdm(text_chunks, desc="📄 Processing text chunks", dynamic_ncols=True, leave=False):
            current_text = overflow_buffer + chunk

            treated_text = semantic_chunking(current_text)

            if hasattr(treated_text, "content"):
                chapters = split_llm_output_into_chapters(treated_text.content)

                last_chapter = chapters[-1] if chapters else ""

                # If the last chapter was cut mid-sentence, carry it over to
                # the next chunk instead of indexing a fragment.
                if last_chapter and not last_chapter.strip().endswith((".", "!", "?", "\n\n")):
                    print("📌 Last chapter seems incomplete, saving for the next cycle")
                    overflow_buffer = last_chapter
                    chapters = chapters[:-1]
                else:
                    overflow_buffer = ""

                for chapter_text in chapters:
                    doc = Document(page_content=chapter_text, metadata={"source": pdf_path})
                    new_chunks.append(doc)
                    print(f"✅ New chapter indexed:\n{chapter_text}...\n")

            else:
                print(f"[ERROR] semantic_chunking returned unexpected type: {type(treated_text)}")

        updated_docs.add(str(pdf_path))

    if new_chunks:
        if vectorstore is not None:
            vectorstore.add_documents(new_chunks)
        else:
            vectorstore = FAISS.from_documents(new_chunks, embedding=embeddings)

        vectorstore.save_local(INDEX_PATH)
        save_indexed_docs(already_indexed_docs.union(updated_docs))
        print(f"💾 {len(new_chunks)} chunks added to FAISS index.")

        print("🧠 Building knowledge graph...")
        create_knowledge_graph(new_chunks)

    else:
        print("📁 No new documents to index.")

    # Guard: without any indexed documents there is nothing to retrieve
    if vectorstore is None:
        print("[ERROR] No FAISS index available — add at least one PDF before chatting.")
        return

    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 50, "fetch_k": 100})
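    # Note: with search_type="similarity", FAISS uses fetch_k only when a
    # metadata filter is supplied; without one, the k=50 nearest chunks are
    # returned directly.
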
    template = """
    Document context:
    {context}

    Graph context:
    {graph_context}

    Question:
    {input}

    Interpretation rules:
    - You can search for a step-by-step tutorial about a subject
    - You can search for a concept description about a subject
    - You can search for a list of components about a subject
    """
    prompt = PromptTemplate.from_template(template)

    def get_context(x):
        query = x.get("input") if isinstance(x, dict) else x
        return retriever.invoke(query)

    chain = (
        RunnableMap({
            "context": RunnableLambda(get_context),
            "graph_context": RunnableLambda(
                lambda x: query_knowledge_graph(
                    extract_graph_keywords(x.get("input") if isinstance(x, dict) else x)
                )
            ),
            "input": lambda x: x.get("input") if isinstance(x, dict) else x,
        })
        | prompt
        | llm
        | StrOutputParser()
    )

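    # The RunnableMap fans the same question out to both retrieval paths:
    # FAISS similarity search fills {context}, while keyword extraction plus
    # the PGQL query fills {graph_context}; the prompt merges both for the LLM.
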
print("✅ READY")
|
||||
|
||||
while True:
|
||||
query = input("❓ Question (or 'quit' to exit): ")
|
||||
if query.lower() == "quit":
|
||||
break
|
||||
response = chain.invoke(query)
|
||||
print("\n📜 RESPONSE:\n")
|
||||
print(response)
|
||||
print("\n" + "=" * 80 + "\n")
|
||||
|
||||
# if __name__ == "__main__":
|
||||
# print("Iniciando")
|
||||
# print(query_knowledge_graph("gateway"))
|
||||
|
||||
# 🚀 Run
|
||||
if __name__ == "__main__":
|
||||
chat()
|
||||
Reference in New Issue
Block a user