mirror of
https://github.com/hoshikawa2/rfp_response_automation.git
synced 2026-03-03 16:09:35 +00:00
205 lines
4.4 KiB
Python
205 lines
4.4 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
FAISS → Oracle 23ai Vector migration (FULL GOVERNANCE VERSION)
|
|
|
|
Migra:
|
|
- content
|
|
- source
|
|
- chunk_hash
|
|
- origin
|
|
- created_at
|
|
- status
|
|
- embedding
|
|
|
|
"""
|
|
|
|
import argparse
import datetime
import hashlib
import json
import os

import oracledb
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from tqdm import tqdm
|
|
|
|
|
|
# =====================================================
# CONFIG
# =====================================================

# Dimensionality of the stored embeddings — must match the FAISS index
# being migrated and the VECTOR(...) column created below.
VECTOR_DIM = 1024
# Target Oracle table that receives the migrated chunks.
TABLE_NAME = "RAG_DOCS"
# Number of rows sent per executemany() round-trip.
BATCH_SIZE = 500
|
|
|
|
|
|
# =====================================================
# CLI
# =====================================================

parser = argparse.ArgumentParser()
# All four flags are mandatory: the FAISS directory plus the Oracle
# connection triple (DSN alias, user, password).
for _flag in ("--faiss", "--dsn", "--user", "--password"):
    parser.add_argument(_flag, required=True)
args = parser.parse_args()
|
|
|
|
|
|
# =====================================================
# HELPERS
# =====================================================

def chunk_hash(text: str) -> str:
    """Return the SHA-256 hex digest of *text* encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
|
|
|
|
|
|
# =====================================================
# 1) LOAD FAISS
# =====================================================

print("🔄 Loading FAISS index...")

# The embedding model here only satisfies the FAISS loader API; the stored
# vectors are read back unchanged, so the model choice does not matter.
loader_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

vs = FAISS.load_local(
    args.faiss,
    loader_embeddings,
    allow_dangerous_deserialization=True,
)

docs = vs.docstore._dict  # id -> Document mapping (LangChain private API)
index = vs.index
# Reconstruct the full dense matrix of all vectors from the FAISS index.
vectors = index.reconstruct_n(0, index.ntotal)

print(f"✅ Loaded {len(docs)} vectors")
|
|
|
|
# =========================
# Oracle Autonomous Configuration
# =========================
# BUG FIX: the DB credentials were hard-coded in source (a committed secret)
# while the required --dsn/--user/--password CLI flags were parsed but
# silently ignored. Take connection settings from the CLI instead; only the
# wallet directory remains a local constant.
WALLET_PATH = "Wallet_oradb23aiDev"
DB_ALIAS = args.dsn
USERNAME = args.user
PASSWORD = args.password

# Point the Oracle client at the wallet directory so the TNS alias resolves.
os.environ["TNS_ADMIN"] = WALLET_PATH
|
|
|
|
# =====================================================
# 2) CONNECT ORACLE
# =====================================================

print("🔌 Connecting to Oracle...")

# NOTE(review): the wallet password is assumed to equal the DB password,
# as in the original configuration — confirm for the target environment.
_connect_params = dict(
    user=USERNAME,
    password=PASSWORD,
    dsn=DB_ALIAS,
    config_dir=WALLET_PATH,
    wallet_location=WALLET_PATH,
    wallet_password=PASSWORD,
)
conn = oracledb.connect(**_connect_params)

cur = conn.cursor()
|
|
|
|
|
|
# =====================================================
# 3) CREATE TABLE (FULL SCHEMA)
# =====================================================

print("📦 Creating table if not exists...")

# Idempotent DDL: attempt the CREATE and swallow only ORA-00955
# ("name is already used by an existing object").
cur.execute(f"""
BEGIN
    EXECUTE IMMEDIATE '
        CREATE TABLE {TABLE_NAME} (
            ID NUMBER GENERATED BY DEFAULT AS IDENTITY,
            CONTENT CLOB,
            SOURCE VARCHAR2(1000),
            CHUNK_HASH VARCHAR2(64),
            STATUS VARCHAR2(20),
            ORIGIN VARCHAR2(50),
            CREATED_AT TIMESTAMP,
            EMBED VECTOR({VECTOR_DIM})
        )';
EXCEPTION
    WHEN OTHERS THEN
        IF SQLCODE != -955 THEN
            RAISE;
        END IF; -- BUG FIX: missing END IF made the whole PL/SQL block invalid
END;
""")

conn.commit()
|
|
|
|
|
|
# =====================================================
# 4) INSERT BATCH
# =====================================================

print("⬆️ Migrating vectors...")

sql = f"""
INSERT INTO {TABLE_NAME}
(CONTENT, SOURCE, CHUNK_HASH, STATUS, ORIGIN, CREATED_AT, EMBED)
VALUES (:1, :2, :3, :4, :5, :6, :7)
"""

batch = []

# NOTE(review): vectors[pos] pairs the pos-th docstore entry with the
# pos-th FAISS vector — this assumes docstore iteration order matches the
# index's vector order; confirm for the index being migrated.
for pos, doc in enumerate(tqdm(docs.values())):
    meta = doc.metadata
    text = doc.page_content
    batch.append((
        text,
        meta.get("source", ""),
        meta.get("chunk_hash") or chunk_hash(text),  # reuse hash if present
        "ACTIVE",
        meta.get("origin", "FAISS"),
        meta.get("created_at", datetime.datetime.utcnow()),
        json.dumps(vectors[pos].tolist()),  # vector serialized as JSON text
    ))

    # Flush a full batch to keep executemany() payloads bounded.
    if len(batch) >= BATCH_SIZE:
        cur.executemany(sql, batch)
        batch.clear()

# Flush the final partial batch, then commit everything at once.
if batch:
    cur.executemany(sql, batch)

conn.commit()

print("✅ Insert finished")
|
|
|
|
|
|
# =====================================================
# 5) CREATE VECTOR INDEX
# =====================================================

print("⚡ Creating HNSW index...")

# Idempotent DDL: swallow only ORA-00955 (object name already in use).
# NOTE(review): Oracle 23ai documents HNSW indexes as
# "ORGANIZATION INMEMORY NEIGHBOR GRAPH"; confirm "ORGANIZATION HNSW"
# is accepted by the target DB version.
cur.execute(f"""
BEGIN
    EXECUTE IMMEDIATE '
        CREATE VECTOR INDEX {TABLE_NAME}_IDX
        ON {TABLE_NAME}(EMBED)
        ORGANIZATION HNSW
        DISTANCE COSINE
    ';
EXCEPTION
    WHEN OTHERS THEN
        IF SQLCODE != -955 THEN
            RAISE;
        END IF; -- BUG FIX: missing END IF made the whole PL/SQL block invalid
END;
""")

conn.commit()

print("🎉 Migration complete!")