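adjustments: store product embeddings in the Oracle table embeddings_produtos instead of FAISS index/pickle files, and search them with a NumPy Euclidean-distance scan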

2026-03-06 10:11:01 +00:00 · 2025-05-14 08:31:50 -03:00
parent 74b73abc15
commit 4d1b7f24c7
3 changed files with 177 additions and 85 deletions
--- a/source/process_vector_products.py
+++ b/source/process_vector_products.py
@@ -1,9 +1,7 @@
 import oracledb
 import os
 from sentence_transformers import SentenceTransformer
-import faiss
 import numpy as np
-import pickle

 # === CONFIGURAÇÃO ORACLE COM WALLET ===
 WALLET_PATH = "/WALLET_PATH/Wallet_oradb23ai"
@@ -14,7 +12,14 @@ PASSWORD = "Password"
 os.environ["TNS_ADMIN"] = WALLET_PATH

 # === CONECTANDO USANDO oracledb (modo thin) ===
-connection = oracledb.connect(user=USERNAME, password=PASSWORD, dsn=DB_ALIAS, config_dir=WALLET_PATH, wallet_location=WALLET_PATH, wallet_password=PASSWORD)
+connection = oracledb.connect(
+    user=USERNAME,
+    password=PASSWORD,
+    dsn=DB_ALIAS,
+    config_dir=WALLET_PATH,
+    wallet_location=WALLET_PATH,
+    wallet_password=PASSWORD
+)

 cursor = connection.cursor()

@@ -26,22 +31,52 @@ ids = []
 descricoes = []

 for row in rows:
-    ids.append({"id": row[0], "codigo": row[1], "descricao": row[2]})
-    descricoes.append(row[2])  # Usado no embedding
+    ids.append((row[0], row[1], row[2]))
+    descricoes.append(row[2])

-# === GERAÇÃO DE EMBEDDINGS COM SENTENCE TRANSFORMERS ===
+# === GERAÇÃO DOS EMBEDDINGS ===
 model = SentenceTransformer('all-MiniLM-L6-v2')
 embeddings = model.encode(descricoes, convert_to_numpy=True)

-# === CRIAÇÃO DO ÍNDICE FAISS ===
-dim = embeddings.shape[1]
-index = faiss.IndexFlatL2(dim)
-index.add(embeddings)
+# === CRIAÇÃO DA TABELA DE EMBEDDINGS (caso não exista) ===
+cursor.execute("""
+               BEGIN
+                   EXECUTE IMMEDIATE '
+            CREATE TABLE embeddings_produtos (
+                id NUMBER PRIMARY KEY,
+                codigo VARCHAR2(100),
+                descricao VARCHAR2(4000),
+                vetor BLOB
+            )';
+               EXCEPTION
+                   WHEN OTHERS THEN
+                       IF SQLCODE != -955 THEN
+                           RAISE;
+                       END IF;
+               END;
+               """)

-# === SALVANDO O ÍNDICE E O MAPA DE PRODUTOS ===
-faiss.write_index(index, "faiss_index.bin")
+# === INSERÇÃO OU ATUALIZAÇÃO DOS DADOS ===
+for (id_, codigo, descricao), vetor in zip(ids, embeddings):
+    vetor_bytes = vetor.astype(np.float32).tobytes()
+    cursor.execute("""
+        MERGE INTO embeddings_produtos tgt
+        USING (SELECT :id AS id FROM dual) src
+        ON (tgt.id = src.id)
+        WHEN MATCHED THEN
+            UPDATE SET codigo = :codigo, descricao = :descricao, vetor = :vetor
+        WHEN NOT MATCHED THEN
+            INSERT (id, codigo, descricao, vetor)
+            VALUES (:id, :codigo, :descricao, :vetor)
+    """, {
+        "id": id_,
+        "codigo": codigo,
+        "descricao": descricao,
+        "vetor": vetor_bytes
+    })

-with open("produto_id_map.pkl", "wb") as f:
-    pickle.dump(ids, f)
+connection.commit()
+cursor.close()
+connection.close()

-print("✅ Vetores gerados e salvos com sucesso.")
+print("✅ Vetores gravados com sucesso no banco Oracle.")
--- a/source/product_search.py
+++ b/source/product_search.py
@@ -1,9 +1,7 @@
-# product_search.py
-
-import faiss
-import pickle
-import difflib
+import os
+import oracledb
 import numpy as np
+import difflib
 from rapidfuzz import fuzz
 from langchain_community.embeddings import OCIGenAIEmbeddings

@@ -11,19 +9,26 @@ from langchain_community.embeddings import OCIGenAIEmbeddings
 class BuscaProdutoSimilar:
    def __init__(
            self,
-            faiss_index_path="faiss_index.bin",
-            id_map_path="produto_id_map.pkl",
            top_k=5,
            distancia_minima=1.0,
            model_id="cohere.embed-english-light-v3.0",
            service_endpoint="https://inference.generativeai.us-chicago-1.oci.oraclecloud.com",
-            compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
-            auth_profile="DEFAULT"
+            compartment_id="ocid1.compartment.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+            auth_profile="DEFAULT",
+            wallet_path="/WALLET_PATH/Wallet_oradb23ai",
+            db_alias="oradb23ai_high",
+            username="USER",
+            password="Password"
    ):
-        print("📦 Carregando índice vetorial...")
-        self.index = faiss.read_index(faiss_index_path)
-        with open(id_map_path, "rb") as f:
-            self.id_map = pickle.load(f)
+        os.environ["TNS_ADMIN"] = wallet_path
+        self.conn = oracledb.connect(
+            user=username,
+            password=password,
+            dsn=db_alias,
+            config_dir=wallet_path,
+            wallet_location=wallet_path,
+            wallet_password=password
+        )
        self.top_k = top_k
        self.distancia_minima = distancia_minima
        self.embedding = OCIGenAIEmbeddings(
@@ -33,8 +38,27 @@ class BuscaProdutoSimilar:
            auth_profile=auth_profile
        )

+        print("📦 Carregando vetores do Oracle...")
+        self._carregar_embeddings()
+
+    def _carregar_embeddings(self):
+        cursor = self.conn.cursor()
+        cursor.execute("SELECT id, codigo, descricao, vetor FROM embeddings_produtos")
+        self.vetores = []
+        self.produtos = []
+        for row in cursor.fetchall():
+            id_, codigo, descricao, blob = row
+            vetor = np.frombuffer(blob.read(), dtype=np.float32)
+            self.vetores.append(vetor)
+            self.produtos.append({
+                "id": id_,
+                "codigo": codigo,
+                "descricao": descricao
+            })
+        self.vetores = np.array(self.vetores)
+
    def _corrigir_input(self, input_usuario):
-        descricoes = [p["descricao"] for p in self.id_map]
+        descricoes = [p["descricao"] for p in self.produtos]
        sugestoes = difflib.get_close_matches(input_usuario, descricoes, n=1, cutoff=0.6)
        return sugestoes[0] if sugestoes else input_usuario

@@ -50,12 +74,16 @@ class BuscaProdutoSimilar:
        }

        consulta_emb = self.embedding.embed_query(descricao_corrigida)
-        consulta_emb = np.array([consulta_emb])
-        distances, indices = self.index.search(consulta_emb, self.top_k)
+        consulta_emb = np.array(consulta_emb)

-        for i, dist in zip(indices[0], distances[0]):
+        # Cálculo de distância euclidiana
+        dists = np.linalg.norm(self.vetores - consulta_emb, axis=1)
+        top_indices = np.argsort(dists)[:self.top_k]
+
+        for idx in top_indices:
+            dist = dists[idx]
            if dist < self.distancia_minima:
-                match = self.id_map[i]
+                match = self.produtos[idx]
                similaridade = 1 / (1 + dist)
                resultados["semanticos"].append({
                    "id": match["id"],
@@ -67,7 +95,7 @@ class BuscaProdutoSimilar:

        if not resultados["semanticos"]:
            melhores_fuzz = []
-            for produto in self.id_map:
+            for produto in self.produtos:
                score = fuzz.token_sort_ratio(descricao_corrigida, produto["descricao"])
                melhores_fuzz.append((produto, score))
            melhores_fuzz.sort(key=lambda x: x[1], reverse=True)