mirror of
https://github.com/hoshikawa2/oci_genai_pdf.git
synced 2026-03-06 02:10:39 +00:00
adjusmtents
This commit is contained in:
258
files/oci_genai_llm_context.py
Normal file
258
files/oci_genai_llm_context.py
Normal file
@@ -0,0 +1,258 @@
|
||||
from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
|
||||
from langchain_core.prompts import PromptTemplate
|
||||
from langchain.schema.output_parser import StrOutputParser
|
||||
from langchain_community.embeddings import OCIGenAIEmbeddings
|
||||
from langchain_community.vectorstores import FAISS
|
||||
from langchain.schema.runnable import RunnableMap
|
||||
from langchain_community.document_loaders import PyPDFLoader, UnstructuredPowerPointLoader, UnstructuredPDFLoader, PyMuPDFLoader
|
||||
from langchain_core.documents import Document
|
||||
from tqdm import tqdm
|
||||
import os
|
||||
import pickle
|
||||
import re
|
||||
from langchain_core.runnables import RunnableLambda
|
||||
|
||||
INDEX_PATH = "./faiss_index"
|
||||
PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl")
|
||||
|
||||
chapter_separator_regex = r"^(#{1,6} .+|\*\*.+\*\*)$"
|
||||
|
||||
def split_llm_output_into_chapters(llm_text):
|
||||
"""
|
||||
Splits the LLM output text into chapters, assuming the LLM separates chapters using markdown-style headings like '# Title'
|
||||
"""
|
||||
chapters = []
|
||||
current_chapter = []
|
||||
lines = llm_text.splitlines()
|
||||
|
||||
for line in lines:
|
||||
if re.match(chapter_separator_regex, line):
|
||||
if current_chapter:
|
||||
chapters.append("\n".join(current_chapter).strip())
|
||||
current_chapter = [line]
|
||||
else:
|
||||
current_chapter.append(line)
|
||||
|
||||
if current_chapter:
|
||||
chapters.append("\n".join(current_chapter).strip())
|
||||
|
||||
return chapters
|
||||
|
||||
def semantic_chunking(text):
|
||||
llm = ChatOCIGenAI(
|
||||
model_id="meta.llama-3.1-405b-instruct",
|
||||
service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
|
||||
compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
|
||||
auth_profile="DEFAULT",
|
||||
)
|
||||
|
||||
prompt = f"""
|
||||
You received the following text extracted via OCR:
|
||||
|
||||
{text}
|
||||
|
||||
Your task:
|
||||
1. Identify headings (short uppercase or bold lines, no period at the end)
|
||||
2. Separate paragraphs by heading
|
||||
3. Indicate columns with [COLUMN 1], [COLUMN 2] if present
|
||||
4. Indicate tables with [TABLE] in markdown format
|
||||
"""
|
||||
|
||||
response = llm.invoke(prompt)
|
||||
return response
|
||||
|
||||
def read_pdfs(pdf_path):
|
||||
if "-ocr" in pdf_path:
|
||||
doc_pages = PyMuPDFLoader(str(pdf_path)).load()
|
||||
else:
|
||||
doc_pages = UnstructuredPDFLoader(str(pdf_path)).load()
|
||||
full_text = "\n".join([page.page_content for page in doc_pages])
|
||||
return full_text
|
||||
|
||||
def smart_split_text(text, max_chunk_size=10_000):
|
||||
chunks = []
|
||||
start = 0
|
||||
text_length = len(text)
|
||||
|
||||
while start < text_length:
|
||||
end = min(start + max_chunk_size, text_length)
|
||||
|
||||
# Try to find the last sentence end before the limit (., ?, !, \n\n)
|
||||
split_point = max(
|
||||
text.rfind('.', start, end),
|
||||
text.rfind('!', start, end),
|
||||
text.rfind('?', start, end),
|
||||
text.rfind('\n\n', start, end)
|
||||
)
|
||||
|
||||
# If not found, make a hard cut
|
||||
if split_point == -1 or split_point <= start:
|
||||
split_point = end
|
||||
else:
|
||||
split_point += 1 # Include the ending character
|
||||
|
||||
chunk = text[start:split_point].strip()
|
||||
if chunk:
|
||||
chunks.append(chunk)
|
||||
|
||||
start = split_point
|
||||
|
||||
return chunks
|
||||
|
||||
def load_previously_indexed_docs():
|
||||
if os.path.exists(PROCESSED_DOCS_FILE):
|
||||
with open(PROCESSED_DOCS_FILE, "rb") as f:
|
||||
return pickle.load(f)
|
||||
return set()
|
||||
|
||||
def save_indexed_docs(docs):
|
||||
with open(PROCESSED_DOCS_FILE, "wb") as f:
|
||||
pickle.dump(docs, f)
|
||||
|
||||
def append_text_to_file(file_path, text):
|
||||
"""
|
||||
Appends text to the end of a file.
|
||||
If the file doesn't exist, it will be created.
|
||||
|
||||
Args:
|
||||
file_path (str): Path to the file where the text will be saved.
|
||||
text (str): Text to append.
|
||||
"""
|
||||
with open(file_path, "a", encoding="utf-8") as f:
|
||||
f.write(text + "\n")
|
||||
|
||||
def chat():
|
||||
llm = ChatOCIGenAI(
|
||||
model_id="meta.llama-3.1-405b-instruct",
|
||||
service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
|
||||
compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
|
||||
auth_profile="DEFAULT", # Replace with your profile name,
|
||||
model_kwargs={"temperature": 0.7, "top_p": 0.75, "max_tokens": 4000},
|
||||
)
|
||||
|
||||
embeddings = OCIGenAIEmbeddings(
|
||||
model_id="cohere.embed-multilingual-v3.0",
|
||||
service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
|
||||
compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
|
||||
auth_profile="DEFAULT", # Replace with your profile name,
|
||||
)
|
||||
|
||||
pdf_paths = [
|
||||
'./Manuals/using-integrations-oracle-integration-3.pdf',
|
||||
'./Manuals/SOASE.pdf',
|
||||
'./Manuals/SOASUITEHL7.pdf'
|
||||
]
|
||||
|
||||
already_indexed_docs = load_previously_indexed_docs()
|
||||
updated_docs = set()
|
||||
|
||||
# Try loading existing FAISS index
|
||||
try:
|
||||
vectorstore = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
|
||||
print("✔️ FAISS index loaded.")
|
||||
except Exception:
|
||||
print("⚠️ FAISS index not found, creating a new one.")
|
||||
vectorstore = None
|
||||
|
||||
new_chunks = []
|
||||
|
||||
pages = []
|
||||
for pdf_path in tqdm(pdf_paths, desc=f"📄 Processing PDFs"):
|
||||
print(f" {os.path.basename(pdf_path)}")
|
||||
if pdf_path in already_indexed_docs:
|
||||
print(f"✅ Document already indexed: {pdf_path}")
|
||||
continue
|
||||
full_text = read_pdfs(pdf_path=pdf_path)
|
||||
|
||||
# Split the text into ~10 KB chunks (~10,000 characters)
|
||||
text_chunks = smart_split_text(full_text, max_chunk_size=10_000)
|
||||
overflow_buffer = "" # Remainder from the previous chapter, if any
|
||||
|
||||
for chunk in tqdm(text_chunks, desc=f"📄 Processing text chunks", dynamic_ncols=True, leave=False):
|
||||
# Join with leftover from previous chunk
|
||||
current_text = overflow_buffer + chunk
|
||||
|
||||
# Send text to LLM for semantic splitting
|
||||
treated_text = semantic_chunking(current_text)
|
||||
|
||||
if hasattr(treated_text, "content"):
|
||||
chapters = split_llm_output_into_chapters(treated_text.content)
|
||||
|
||||
# Check if the last chapter seems incomplete
|
||||
last_chapter = chapters[-1] if chapters else ""
|
||||
|
||||
# Simple criteria: if text ends without punctuation (like . ! ?) or is too short
|
||||
if last_chapter and not last_chapter.strip().endswith((".", "!", "?", "\n\n")):
|
||||
print("📌 Last chapter seems incomplete, saving for the next cycle")
|
||||
overflow_buffer = last_chapter
|
||||
chapters = chapters[:-1] # Don't index the last incomplete chapter yet
|
||||
else:
|
||||
overflow_buffer = "" # Nothing left over
|
||||
|
||||
# Save complete chapters as document chunks
|
||||
for chapter_text in chapters:
|
||||
doc = Document(page_content=chapter_text, metadata={"source": pdf_path})
|
||||
new_chunks.append(doc)
|
||||
print(f"✅ New chapter indexed:\n{chapter_text}...\n")
|
||||
|
||||
else:
|
||||
print(f"[ERROR] semantic_chunking returned unexpected type: {type(treated_text)}")
|
||||
|
||||
updated_docs.add(str(pdf_path))
|
||||
|
||||
# If there are new documents, index them
|
||||
if new_chunks:
|
||||
if vectorstore:
|
||||
vectorstore.add_documents(new_chunks)
|
||||
else:
|
||||
vectorstore = FAISS.from_documents(new_chunks, embedding=embeddings)
|
||||
|
||||
vectorstore.save_local(INDEX_PATH)
|
||||
save_indexed_docs(already_indexed_docs.union(updated_docs))
|
||||
print(f"💾 {len(new_chunks)} chunks added to FAISS index.")
|
||||
else:
|
||||
print("📁 No new documents to index.")
|
||||
|
||||
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 50, "fetch_k": 100})
|
||||
|
||||
template = """
|
||||
Document context:
|
||||
{context}
|
||||
|
||||
Question:
|
||||
{input}
|
||||
|
||||
Interpretation rules:
|
||||
Rule 1: SOA SUITE documents: `SOASUITE.pdf` and `SOASUITEHL7.pdf`
|
||||
Rule 2: Oracle Integration (known as OIC) document: `using-integrations-oracle-integration-3.pdf`
|
||||
Rule 3: If the query is not a comparison between SOA SUITE and Oracle Integration (OIC), only consider documents relevant to the product.
|
||||
Rule 4: If the question is a comparison between SOA SUITE and OIC, consider all documents and compare between them.
|
||||
Mention at the beginning which tool is being addressed: {input}
|
||||
"""
|
||||
prompt = PromptTemplate.from_template(template)
|
||||
|
||||
def get_context(x):
|
||||
query = x.get("input") if isinstance(x, dict) else x
|
||||
return retriever.invoke(query)
|
||||
|
||||
chain = (
|
||||
RunnableMap({
|
||||
"context": RunnableLambda(get_context),
|
||||
"input": lambda x: x.get("input") if isinstance(x, dict) else x
|
||||
})
|
||||
| prompt
|
||||
| llm
|
||||
| StrOutputParser()
|
||||
)
|
||||
|
||||
print("READY")
|
||||
|
||||
while True:
|
||||
query = input()
|
||||
if query == "quit":
|
||||
break
|
||||
response = chain.invoke(query)
|
||||
print(type(response)) # <class 'str'>
|
||||
print(response)
|
||||
|
||||
chat()
|
||||
164
files/oci_genai_llm_context_fast.py
Normal file
164
files/oci_genai_llm_context_fast.py
Normal file
@@ -0,0 +1,164 @@
|
||||
from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
|
||||
from langchain_core.prompts import PromptTemplate
|
||||
from langchain.schema.output_parser import StrOutputParser
|
||||
from langchain_community.embeddings import OCIGenAIEmbeddings
|
||||
from langchain_community.vectorstores import FAISS
|
||||
from langchain.schema.runnable import RunnableMap
|
||||
from langchain_community.document_loaders import PyPDFLoader, UnstructuredPowerPointLoader, UnstructuredPDFLoader, PyMuPDFLoader
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.runnables import RunnableLambda
|
||||
from tqdm import tqdm
|
||||
import os
|
||||
import pickle
|
||||
|
||||
INDEX_PATH = "./faiss_index"
|
||||
PROCESSED_DOCS_FILE = os.path.join(INDEX_PATH, "processed_docs.pkl")
|
||||
|
||||
def read_pdfs(pdf_path):
|
||||
if "-ocr" in pdf_path:
|
||||
doc_pages = PyMuPDFLoader(str(pdf_path)).load()
|
||||
else:
|
||||
doc_pages = UnstructuredPDFLoader(str(pdf_path)).load()
|
||||
full_text = "\n".join([page.page_content for page in doc_pages])
|
||||
return full_text
|
||||
|
||||
def smart_split_text(text, max_chunk_size=2000):
|
||||
chunks = []
|
||||
start = 0
|
||||
text_length = len(text)
|
||||
|
||||
while start < text_length:
|
||||
end = min(start + max_chunk_size, text_length)
|
||||
split_point = max(
|
||||
text.rfind('.', start, end),
|
||||
text.rfind('!', start, end),
|
||||
text.rfind('?', start, end),
|
||||
text.rfind('\n\n', start, end)
|
||||
)
|
||||
|
||||
if split_point == -1 or split_point <= start:
|
||||
split_point = end
|
||||
else:
|
||||
split_point += 1
|
||||
|
||||
chunk = text[start:split_point].strip()
|
||||
if chunk:
|
||||
chunks.append(chunk)
|
||||
|
||||
start = split_point
|
||||
|
||||
return chunks
|
||||
|
||||
def load_previously_indexed_docs():
|
||||
if os.path.exists(PROCESSED_DOCS_FILE):
|
||||
with open(PROCESSED_DOCS_FILE, "rb") as f:
|
||||
return pickle.load(f)
|
||||
return set()
|
||||
|
||||
def save_indexed_docs(docs):
|
||||
with open(PROCESSED_DOCS_FILE, "wb") as f:
|
||||
pickle.dump(docs, f)
|
||||
|
||||
def chat():
|
||||
llm = ChatOCIGenAI(
|
||||
model_id="meta.llama-3.1-405b-instruct",
|
||||
service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
|
||||
compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
|
||||
auth_profile="DEFAULT",
|
||||
model_kwargs={"temperature": 0.7, "top_p": 0.75, "max_tokens": 4000},
|
||||
)
|
||||
|
||||
embeddings = OCIGenAIEmbeddings(
|
||||
model_id="cohere.embed-multilingual-v3.0",
|
||||
service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
|
||||
compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
|
||||
auth_profile="DEFAULT",
|
||||
)
|
||||
|
||||
pdf_paths = [
|
||||
'./Manuals/using-integrations-oracle-integration-3.pdf',
|
||||
'./Manuals/SOASE.pdf',
|
||||
'./Manuals/SOASUITEHL7.pdf'
|
||||
]
|
||||
|
||||
already_indexed_docs = load_previously_indexed_docs()
|
||||
updated_docs = set()
|
||||
|
||||
try:
|
||||
vectorstore = FAISS.load_local(INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
|
||||
print("✔️ FAISS index loaded.")
|
||||
except Exception:
|
||||
print("⚠️ FAISS index not found, creating a new one.")
|
||||
vectorstore = None
|
||||
|
||||
new_chunks = []
|
||||
|
||||
for pdf_path in tqdm(pdf_paths, desc="📄 Processing PDFs"):
|
||||
print(f" {os.path.basename(pdf_path)}")
|
||||
if pdf_path in already_indexed_docs:
|
||||
print(f"✅ Already indexed: {pdf_path}")
|
||||
continue
|
||||
|
||||
full_text = read_pdfs(pdf_path=pdf_path)
|
||||
text_chunks = smart_split_text(full_text, max_chunk_size=2000)
|
||||
|
||||
for chunk_text in tqdm(text_chunks, desc=f"📄 Splitting text", dynamic_ncols=True, leave=False):
|
||||
doc = Document(page_content=chunk_text, metadata={"source": pdf_path})
|
||||
new_chunks.append(doc)
|
||||
print(f"✅ Indexed chunk with {len(chunk_text)} chars.")
|
||||
|
||||
updated_docs.add(str(pdf_path))
|
||||
|
||||
if new_chunks:
|
||||
if vectorstore:
|
||||
vectorstore.add_documents(new_chunks)
|
||||
else:
|
||||
vectorstore = FAISS.from_documents(new_chunks, embedding=embeddings)
|
||||
|
||||
vectorstore.save_local(INDEX_PATH)
|
||||
save_indexed_docs(already_indexed_docs.union(updated_docs))
|
||||
print(f"💾 {len(new_chunks)} chunks saved to FAISS index.")
|
||||
else:
|
||||
print("📁 No new documents to index.")
|
||||
|
||||
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 50, "fetch_k": 100})
|
||||
|
||||
template = """
|
||||
Document context:
|
||||
{context}
|
||||
|
||||
Question:
|
||||
{input}
|
||||
|
||||
Interpretation rules:
|
||||
Rule 1: SOA SUITE documents: `SOASUITE.pdf` and `SOASUITEHL7.pdf`
|
||||
Rule 2: Oracle Integration (OIC) document: `using-integrations-oracle-integration-3.pdf`
|
||||
Rule 3: If not a comparison between SOA SUITE and OIC, only consider documents relevant to the product.
|
||||
Rule 4: If the question compares SOA SUITE and OIC, compare both.
|
||||
Mention at the beginning which tool is being addressed: {input}
|
||||
"""
|
||||
prompt = PromptTemplate.from_template(template)
|
||||
|
||||
def get_context(x):
|
||||
query = x.get("input") if isinstance(x, dict) else x
|
||||
return retriever.invoke(query)
|
||||
|
||||
chain = (
|
||||
RunnableMap({
|
||||
"context": RunnableLambda(get_context),
|
||||
"input": lambda x: x.get("input") if isinstance(x, dict) else x
|
||||
})
|
||||
| prompt
|
||||
| llm
|
||||
| StrOutputParser()
|
||||
)
|
||||
|
||||
print("READY")
|
||||
while True:
|
||||
query = input()
|
||||
if query == "quit":
|
||||
break
|
||||
response = chain.invoke(query)
|
||||
print(response)
|
||||
|
||||
chat()
|
||||
16
files/requirements.txt
Normal file
16
files/requirements.txt
Normal file
@@ -0,0 +1,16 @@
|
||||
langchain==0.2.0
|
||||
langchain-community==0.0.30
|
||||
langchain-core==0.2.0
|
||||
tqdm
|
||||
faiss-cpu
|
||||
unstructured[pdf,ppt]==0.13.2
|
||||
PyMuPDF==1.24.1
|
||||
PyPDF2==3.0.1
|
||||
ocrmypdf==14.1.0 # opcional, se quiser OCR fallback
|
||||
pypandoc # necessário para alguns loaders .pptx
|
||||
pillow
|
||||
python-docx
|
||||
chardet
|
||||
lxml
|
||||
oci
|
||||
oci-cli
|
||||
Reference in New Issue
Block a user