#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FAISS → Oracle 23ai Vector migration (FULL GOVERNANCE VERSION)

Migrates per chunk:
- content
- source
- chunk_hash
- origin
- created_at
- status
- embedding
"""

import json
import argparse
import hashlib
import datetime
import os  # BUG FIX: os.environ was used below without importing os

import oracledb
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from tqdm import tqdm

# =====================================================
# CONFIG
# =====================================================

VECTOR_DIM = 1024          # dimension of the EMBED VECTOR column
TABLE_NAME = "RAG_DOCS"
BATCH_SIZE = 500           # rows per executemany() round-trip

# =====================================================
# CLI
# =====================================================

parser = argparse.ArgumentParser()
parser.add_argument("--faiss", required=True)
parser.add_argument("--dsn", required=True)
parser.add_argument("--user", required=True)
parser.add_argument("--password", required=True)
args = parser.parse_args()

# =====================================================
# HELPERS
# =====================================================


def chunk_hash(text: str) -> str:
    """Return the SHA-256 hex digest of *text* encoded as UTF-8."""
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


# =====================================================
# 1) LOAD FAISS
# =====================================================

print("🔄 Loading FAISS index...")

# The embeddings object only satisfies FAISS.load_local's signature; the
# stored vectors are reconstructed directly from the index below, so this
# model is never actually used to embed anything.
dummy_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

vs = FAISS.load_local(
    args.faiss,
    dummy_embeddings,
    allow_dangerous_deserialization=True
)

index = vs.index
vectors = index.reconstruct_n(0, index.ntotal)

# BUG FIX: docstore._dict iteration order is NOT guaranteed to match FAISS
# index position order, yet the insert loop pairs docs positionally with
# vectors[i]. Rebuild the mapping in index order via index_to_docstore_id
# so the i-th document really corresponds to the i-th reconstructed vector.
docs = {
    vs.index_to_docstore_id[i]: vs.docstore._dict[vs.index_to_docstore_id[i]]
    for i in range(index.ntotal)
}

print(f"✅ Loaded {len(docs)} vectors")

# =========================
# Oracle Autonomous Configuration
# =========================
# BUG FIX: credentials were hard-coded (leaking a plaintext password in
# source control) while the parsed --dsn/--user/--password CLI args were
# silently ignored. Use the CLI values; wallet path is env-overridable.
WALLET_PATH = os.environ.get("WALLET_PATH", "Wallet_oradb23aiDev")
DB_ALIAS = args.dsn
USERNAME = args.user
PASSWORD = args.password

os.environ["TNS_ADMIN"] = WALLET_PATH

# =====================================================
# 2) CONNECT ORACLE
# =====================================================

print("🔌 Connecting to Oracle...")

conn = oracledb.connect(
    user=USERNAME,
    password=PASSWORD,
    dsn=DB_ALIAS,
    config_dir=WALLET_PATH,
    wallet_location=WALLET_PATH,
    wallet_password=PASSWORD
)
cur = conn.cursor()

# =====================================================
# 3) CREATE TABLE (FULL SCHEMA)
# =====================================================

print("📦 Creating table if not exists...")

# BUG FIX: the original PL/SQL was missing "END IF;" — a bare "END;" after
# "IF ... THEN RAISE;" raises PLS-00103 at runtime, so the idempotent
# create never worked. ORA-00955 ("name is already used") is swallowed so
# reruns against an existing table succeed.
cur.execute(f"""
BEGIN
    EXECUTE IMMEDIATE '
        CREATE TABLE {TABLE_NAME} (
            ID NUMBER GENERATED BY DEFAULT AS IDENTITY,
            CONTENT CLOB,
            SOURCE VARCHAR2(1000),
            CHUNK_HASH VARCHAR2(64),
            STATUS VARCHAR2(20),
            ORIGIN VARCHAR2(50),
            CREATED_AT TIMESTAMP,
            EMBED VECTOR({VECTOR_DIM})
        )';
EXCEPTION
    WHEN OTHERS THEN
        IF SQLCODE != -955 THEN
            RAISE;
        END IF;
END;
""")
conn.commit()

# =====================================================
# 4) INSERT BATCH
# =====================================================

print("⬆️ Migrating vectors...")

sql = f"""
INSERT INTO {TABLE_NAME}
(CONTENT, SOURCE, CHUNK_HASH, STATUS, ORIGIN, CREATED_AT, EMBED)
VALUES (:1, :2, :3, :4, :5, :6, :7)
"""

batch = []

for i, (doc_id, doc) in enumerate(tqdm(docs.items())):
    content = doc.page_content
    source = doc.metadata.get("source", "")
    origin = doc.metadata.get("origin", "FAISS")
    # BUG FIX: utcnow() is deprecated; use an aware UTC timestamp, then
    # drop tzinfo since the CREATED_AT column is a zone-less TIMESTAMP.
    created = doc.metadata.get(
        "created_at",
        datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    )
    # Prefer a pre-computed hash from metadata; fall back to hashing here.
    h = doc.metadata.get("chunk_hash") or chunk_hash(content)

    batch.append((
        content,
        source,
        h,
        "ACTIVE",
        origin,
        created,
        # VECTOR columns accept the textual '[v1, v2, ...]' representation.
        json.dumps(vectors[i].tolist())
    ))

    if len(batch) >= BATCH_SIZE:
        cur.executemany(sql, batch)
        batch.clear()

# Flush the final partial batch, then commit everything atomically.
if batch:
    cur.executemany(sql, batch)

conn.commit()
print("✅ Insert finished")

# =====================================================
# 5) CREATE VECTOR INDEX
# =====================================================

print("⚡ Creating HNSW index...")

# Same END IF; fix as the table DDL above; ORA-00955 tolerated for reruns.
cur.execute(f"""
BEGIN
    EXECUTE IMMEDIATE '
        CREATE VECTOR INDEX {TABLE_NAME}_IDX
        ON {TABLE_NAME}(EMBED)
        ORGANIZATION HNSW
        DISTANCE COSINE
    ';
EXCEPTION
    WHEN OTHERS THEN
        IF SQLCODE != -955 THEN
            RAISE;
        END IF;
END;
""")
conn.commit()

# Release DB resources explicitly (script exit would leak them noisily).
cur.close()
conn.close()

print("🎉 Migration complete!")