Files
rfp_response_automation/files/faiss_to_oracle_vector.py
2026-02-18 20:34:33 -03:00

205 lines
4.4 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FAISS → Oracle 23ai Vector migration (FULL GOVERNANCE VERSION)
Migra:
- content
- source
- chunk_hash
- origin
- created_at
- status
- embedding
"""
import json
import argparse
import hashlib
import datetime
import oracledb
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from tqdm import tqdm
# =====================================================
# CONFIG
# =====================================================
VECTOR_DIM = 1024
TABLE_NAME = "RAG_DOCS"
BATCH_SIZE = 500
# =====================================================
# CLI
# =====================================================
parser = argparse.ArgumentParser()
parser.add_argument("--faiss", required=True)
parser.add_argument("--dsn", required=True)
parser.add_argument("--user", required=True)
parser.add_argument("--password", required=True)
args = parser.parse_args()
# =====================================================
# HELPERS
# =====================================================
def chunk_hash(text: str) -> str:
return hashlib.sha256(text.encode("utf-8")).hexdigest()
# =====================================================
# 1) LOAD FAISS
# =====================================================
print("🔄 Loading FAISS index...")
dummy_embeddings = HuggingFaceEmbeddings(
model_name="sentence-transformers/all-MiniLM-L6-v2"
)
vs = FAISS.load_local(
args.faiss,
dummy_embeddings,
allow_dangerous_deserialization=True
)
docs = vs.docstore._dict
index = vs.index
vectors = index.reconstruct_n(0, index.ntotal)
print(f"✅ Loaded {len(docs)} vectors")
# =========================
# Oracle Autonomous Configuration
# =========================
WALLET_PATH = "Wallet_oradb23aiDev"
DB_ALIAS = "oradb23aiDev_high"
USERNAME = "admin"
PASSWORD = "Moniquinha1972"
os.environ["TNS_ADMIN"] = WALLET_PATH
# =====================================================
# 2) CONNECT ORACLE
# =====================================================
print("🔌 Connecting to Oracle...")
conn = oracledb.connect(
user=USERNAME,
password=PASSWORD,
dsn=DB_ALIAS,
config_dir=WALLET_PATH,
wallet_location=WALLET_PATH,
wallet_password=PASSWORD
)
cur = conn.cursor()
# =====================================================
# 3) CREATE TABLE (FULL SCHEMA)
# =====================================================
print("📦 Creating table if not exists...")
cur.execute(f"""
BEGIN
EXECUTE IMMEDIATE '
CREATE TABLE {TABLE_NAME} (
ID NUMBER GENERATED BY DEFAULT AS IDENTITY,
CONTENT CLOB,
SOURCE VARCHAR2(1000),
CHUNK_HASH VARCHAR2(64),
STATUS VARCHAR2(20),
ORIGIN VARCHAR2(50),
CREATED_AT TIMESTAMP,
EMBED VECTOR({VECTOR_DIM})
)';
EXCEPTION
WHEN OTHERS THEN
IF SQLCODE != -955 THEN RAISE;
END;
""")
conn.commit()
# =====================================================
# 4) INSERT BATCH
# =====================================================
print("⬆️ Migrating vectors...")
sql = f"""
INSERT INTO {TABLE_NAME}
(CONTENT, SOURCE, CHUNK_HASH, STATUS, ORIGIN, CREATED_AT, EMBED)
VALUES (:1, :2, :3, :4, :5, :6, :7)
"""
batch = []
for i, (doc_id, doc) in enumerate(tqdm(docs.items())):
content = doc.page_content
source = doc.metadata.get("source", "")
origin = doc.metadata.get("origin", "FAISS")
created = doc.metadata.get(
"created_at",
datetime.datetime.utcnow()
)
h = doc.metadata.get("chunk_hash") or chunk_hash(content)
batch.append((
content,
source,
h,
"ACTIVE",
origin,
created,
json.dumps(vectors[i].tolist())
))
if len(batch) >= BATCH_SIZE:
cur.executemany(sql, batch)
batch.clear()
if batch:
cur.executemany(sql, batch)
conn.commit()
print("✅ Insert finished")
# =====================================================
# 5) CREATE VECTOR INDEX
# =====================================================
print("⚡ Creating HNSW index...")
cur.execute(f"""
BEGIN
EXECUTE IMMEDIATE '
CREATE VECTOR INDEX {TABLE_NAME}_IDX
ON {TABLE_NAME}(EMBED)
ORGANIZATION HNSW
DISTANCE COSINE
';
EXCEPTION
WHEN OTHERS THEN
IF SQLCODE != -955 THEN RAISE;
END;
""")
conn.commit()
print("🎉 Migration complete!")