From 4868a1d458184c2adb64cf70eb835d981fee92ee Mon Sep 17 00:00:00 2001
From: Cristiano Hoshikawa <hoshikawa@uol.com.br>
Date: Mon, 21 Jul 2025 20:15:39 -0300
Subject: [PATCH] First Commit

---
 .idea/.gitignore                       |  12 ++
 .idea/misc.xml                         |   6 +
 .idea/modules.xml                      |   8 +
 .idea/oci_vision_invoice.iml           |   9 +
 .idea/vcs.xml                          |   6 +
 .oca/custom_code_review_guidelines.txt |  24 +++
 README.md                              | 221 +++++++++++++++++++++++++
 files/config                           |   8 +
 files/main.py                          | 150 +++++++++++++++++
 files/requirements.txt                 |  13 ++
 10 files changed, 457 insertions(+)
 create mode 100644 .idea/.gitignore
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/modules.xml
 create mode 100644 .idea/oci_vision_invoice.iml
 create mode 100644 .idea/vcs.xml
 create mode 100644 .oca/custom_code_review_guidelines.txt
 create mode 100644 README.md
 create mode 100644 files/config
 create mode 100644 files/main.py
 create mode 100644 files/requirements.txt
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..7d9a8e5
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,12 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Environment-dependent path to Maven home directory
+/mavenHomeManager.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Zeppelin ignored files
+/ZeppelinRemoteNotebooks/
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..f03c948
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_23" default="true" project-jdk-name="23" project-jdk-type="JavaSDK">
+    <output url="file://$PROJECT_DIR$/out" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..e1d27e4
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/oci_vision_invoice.iml" filepath="$PROJECT_DIR$/.idea/oci_vision_invoice.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/oci_vision_invoice.iml b/.idea/oci_vision_invoice.iml
new file mode 100644
index 0000000..d6ebd48
--- /dev/null
+++ b/.idea/oci_vision_invoice.iml
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/.oca/custom_code_review_guidelines.txt b/.oca/custom_code_review_guidelines.txt
new file mode 100644
index 0000000..a0a3b63
--- /dev/null
+++ b/.oca/custom_code_review_guidelines.txt
@@ -0,0 +1,24 @@
+# Sample guideline, please follow similar structure for guideline with code samples
+# 1. Suggest using streams instead of simple loops for better readability.
+#          <example>
+#          *Comment:
+#          Category: Minor
+#          Issue: Use streams instead of a loop for better readability.
+#          Code Block:
+#
+#        ```java
+#            // Calculate squares of numbers
+#            List<Integer> squares = new ArrayList<>();
+#            for (int number : numbers) {
+#              squares.add(number * number);
+#            }
+#          ```
+#          Recommendation:
+#
+#        ```java
+#            // Calculate squares of numbers
+#            List<Integer> squares = Arrays.stream(numbers)
+#              .map(n -> n * n) // Map each number to its square
+#              .toList();
+#          ```
+#          </example>
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7a5e80d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,221 @@
+# 📄 Automatic Invoice Processing with OCI Vision and OCI Generative AI
+
+## 🧠 Objective
+
+This tutorial demonstrates how to implement an automated pipeline that monitors a bucket in Oracle Cloud Infrastructure (OCI) for incoming invoice images, extracts textual content using **OCI Vision**, and then applies **OCI Generative AI** (LLM) to extract structured fiscal data like invoice number, customer, and item list.
+
+---
+
+## 🚀 Use Cases
+
+- Automating invoice ingestion from Object Storage.
+- Extracting structured data from semi-structured scanned documents.
+- Integrating OCR and LLM in real-time pipelines using OCI AI services.
+
+---
+
+## 🧱 Oracle Cloud Services Used
+
+| Service                     | Purpose                                                                 |
+|----------------------------|-------------------------------------------------------------------------|
+| **OCI Vision**             | Performs OCR (Optical Character Recognition) on uploaded invoice images.|
+| **OCI Generative AI**      | Extracts structured JSON data from raw OCR text using few-shot prompts. |
+| **Object Storage**         | Stores input invoice images and output JSON results.                    |
+
+---
+
+## ⚙️ Prerequisites
+
+1. An OCI account with access to:
+    - Vision AI
+    - Generative AI
+    - Object Storage
+2. Python 3.10 or later.
+3. A bucket for input images (e.g., `input-bucket`) and another for output files (e.g., `output-bucket`).
+4. A [config](./files/config) with:
+   ```json
+   {
+     "oci_profile": "DEFAULT",
+     "namespace": "your_namespace",
+     "input_bucket": "input-bucket",
+     "output_bucket": "output-bucket",
+     "compartment_id": "ocid1.compartment.oc1..xxxx",
+     "llm_endpoint": "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com"
+   }
+   ```
+
+---
+
+## 🛠️ How to Run
+
+1. Install the dependencies listed in [requirements.txt](./files/requirements.txt) with:
+
+     
+    pip install -r requirements.txt 
+
+2. From the `files` directory, run the Python script [main.py](./files/main.py) (it opens `./config` relative to the working directory).
+3. Upload invoice images (e.g., `.png`, `.jpg`) to your input bucket.
+4. Wait for the image to be processed and the extracted JSON saved in the output bucket.
+
+---
+
+## 🧩 Code Walkthrough
+
+### 1. Load Configuration
+
+```python
+with open("./config", "r") as f:
+    config_data = json.load(f)
+```
+
+> Loads all required configuration values such as namespace, bucket names, compartment ID, and LLM endpoint.
+
+---
+
+### 2. Initialize OCI Clients
+
+```python
+oci_config = oci.config.from_file("~/.oci/config", PROFILE)
+object_storage = oci.object_storage.ObjectStorageClient(oci_config)
+ai_vision_client = oci.ai_vision.AIServiceVisionClient(oci_config)
+```
+
+> Sets up the OCI SDK clients to access Object Storage and AI Vision services.
+
+---
+
+### 3. Initialize LLM
+
+```python
+llm = ChatOCIGenAI(
+    model_id="meta.llama-3.1-405b-instruct",
+    service_endpoint=LLM_ENDPOINT,
+    compartment_id=COMPARTMENT_ID,
+    auth_profile=PROFILE,
+    model_kwargs={"temperature": 0.7, "top_p": 0.75, "max_tokens": 2000},
+)
+```
+
+> Initializes the OCI Generative AI model for natural language understanding and text-to-structure conversion.
+
+---
+
+### 4. Few-shot Prompt
+
+```python
+few_shot_examples = [ ... ]
+instruction = """
+You are a fiscal data extractor.
+...
+"""
+```
+
+> Uses few-shot learning by providing an example of the expected output so the model learns how to extract structured fields such as `nf` (invoice number), `customer`, `location`, and `items`.
+
+---
+
+### 5. OCR with OCI Vision
+
+```python
+def perform_ocr(file_name):
+    ...
+```
+
+> This function:
+> - Sends the image to OCI Vision.
+> - Requests text detection.
+> - Returns the analysis result (`response.data`) produced by OCI Vision.
+
+---
+
+### 6. Data Extraction with LLM
+
+```python
+def extract_data_with_llm(ocr_text, file_name):
+    ...
+```
+
+> This function:
+> - Combines instructions + few-shot example + OCR text.
+> - Sends it to OCI Generative AI.
+> - Receives structured JSON fields (as string).
+
+---
+
+### 7. Save Output to Object Storage
+
+```python
+def save_output(result, file_name):
+    ...
+```
+
+> Uploads the structured result into the output bucket, named after the stem of the original file name plus the `.json` extension.
+
+---
+
+### 8. Main Loop: Monitor and Process
+
+```python
+def monitor_bucket():
+    ...
+```
+
+> Main routine that:
+> - Monitors the input bucket every 30 seconds.
+> - Detects new `.png`, `.jpg`, `.jpeg` files.
+> - Runs OCR + LLM + Upload in sequence.
+> - Keeps track of already processed files in memory.
+
+---
+
+### 9. Entry Point
+
+```python
+if __name__ == "__main__":
+    monitor_bucket()
+```
+
+> Starts the bucket watcher and begins processing invoices automatically.
+
+---
+
+## ✅ Expected Output
+
+For each uploaded invoice image:
+- A corresponding `.json` file is generated with structured content like:
+
+```json
+{
+  "file": "nota123.png",
+  "result": "{ \"nf\": \"NF102030\", \"customer\": \"Comercial ABC Ltda\", ... }",
+  "timestamp": "2025-07-21T12:34:56.789012"
+}
+```
+
+---
+
+## 🧪 Testing Suggestions
+
+- Use real or dummy invoices with legible product lines and issuer (emitente) details.
+- Upload multiple images in sequence to see automated processing.
+- Log into OCI Console > Object Storage to verify results in both buckets.
+
+---
+
+## 📌 Notes
+
+- OCI Vision supports Portuguese OCR (`language="POR"` can be used instead of `"ENG"`).
+- LLM prompt can be adjusted to extract other fields like `CNPJ`, `quantidade`, `data de emissão`, etc.
+- Consider persisting `processed_files` with a database or file to make the process fault-tolerant.
+
+---
+
+## 📚 References
+
+- [OCI Vision Documentation](https://docs.oracle.com/en-us/iaas/vision/)
+- [OCI Generative AI Documentation](https://docs.oracle.com/en-us/iaas/generative-ai/)
+- [LangChain OCI Integration](https://python.langchain.com/docs/integrations/chat/oci_gen_ai/)
+
+## Acknowledgments
+
+- **Author** - Cristiano Hoshikawa (Oracle LAD A-Team Solution Engineer)
diff --git a/files/config b/files/config
new file mode 100644
index 0000000..cc94e5e
--- /dev/null
+++ b/files/config
@@ -0,0 +1,8 @@
+{
+  "oci_profile": "DEFAULT",
+  "compartment_id": "<YOUR COMPARTMENT OCID>",
+  "namespace": "<YOUR OBJECT STORAGE NAMESPACE>",
+  "input_bucket": "<YOUR INVOICE IMAGES BUCKET NAME>",
+  "output_bucket": "<YOUR OUTPUT JSON FILES BUCKET NAME>",
+  "llm_endpoint": "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com"
+}
diff --git a/files/main.py b/files/main.py
new file mode 100644
index 0000000..a377c06
--- /dev/null
+++ b/files/main.py
@@ -0,0 +1,150 @@
+import time
+import json
+from pathlib import Path
+from datetime import datetime
+
+import oci
+from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
+from langchain.schema import HumanMessage
+
+# ====================
+# 1. Load Configuration
+# ====================
+with open("./config", "r") as f:
+    config_data = json.load(f)
+
+NAMESPACE = config_data["namespace"]
+INPUT_BUCKET = config_data["input_bucket"]
+OUTPUT_BUCKET = config_data["output_bucket"]
+PROFILE = config_data["oci_profile"]
+COMPARTMENT_ID = config_data["compartment_id"]
+LLM_ENDPOINT = config_data["llm_endpoint"]
+
+# ====================
+# 2. Initialize OCI Clients
+# ====================
+oci_config = oci.config.from_file("~/.oci/config", PROFILE)
+object_storage = oci.object_storage.ObjectStorageClient(oci_config)
+ai_vision_client = oci.ai_vision.AIServiceVisionClient(oci_config)
+
+# ====================
+# 3. Initialize LLM (LangChain chat model for OCI Generative AI, shared by every request)
+# ====================
+llm = ChatOCIGenAI(
+    model_id="meta.llama-3.1-405b-instruct",
+    service_endpoint=LLM_ENDPOINT,
+    compartment_id=COMPARTMENT_ID,
+    auth_profile=PROFILE,  # same profile name the OCI SDK clients are created with
+    model_kwargs={"temperature": 0.7, "top_p": 0.75, "max_tokens": 2000},  # NOTE(review): sampling at 0.7 makes extraction non-deterministic - confirm intended
+)
+
+# ====================
+# 4. Few-shot Prompt Base (worked example + task instruction, combined in extract_data_with_llm)
+# ====================
+few_shot_examples = [
+    """
+    Invoice text: 
+        "EMITENTE": "Comercial ABC Ltda - Rua A, 123 - Belo Horizonte - MG"
+        "NF": "NF102030"
+        "DESTINATÁRIO": "Distribuidora XYZ - São Paulo - SP"
+        "DESCRIÇÃO DO PRODUTO": 
+            "Cabo HDMI 2.0 2m, preto" | PRICE: 39.90 
+            "Teclado Mecânico RGB ABNT2" | PRICE: 199.99 
+            "Mouse Gamer 3200DPI" | PRICE: 89.50 
+
+    Extracted fields (JSON format):
+        {
+          "nf": "NF102030",
+          "customer": "Comercial ABC Ltda",
+          "location": "MG",
+          "items": [
+            {"description": "Cabo HDMI 2.0 2m, preto", "price": 39.90},
+            {"description": "Teclado Mecânico RGB ABNT2", "price": 199.99},
+            {"description": "Mouse Gamer 3200DPI", "price": 89.50}
+          ]
+        }
+    """
+]
+
+instruction = """
+You are a fiscal data extractor.
+
+Your goal is to:
+- Extract the invoice number (field 'nf')
+- Extract the customer name (field 'customer')
+- Extract the state (field 'location') — ⚠️ use **only** the state of the EMITTER company, based on its name and address.
+- Extract the list of products and prices (field 'items')
+"""  # placed before the few-shot examples and the OCR output when the prompt is assembled
+
+# ====================
+# 5. Bucket Monitoring and Processing
+# ====================
+processed_files = set()  # object names already handled; kept in memory only, so it is empty again after a restart
+
+def perform_ocr(file_name):
+    print(f"📄 Performing OCR on: {file_name}")
+
+    response = ai_vision_client.analyze_document(
+        analyze_document_details=oci.ai_vision.models.AnalyzeDocumentDetails(
+            features=[
+                oci.ai_vision.models.DocumentTableDetectionFeature(
+                    feature_type="TEXT_DETECTION")],
+            document=oci.ai_vision.models.ObjectStorageDocumentDetails(
+                source="OBJECT_STORAGE",
+                namespace_name=NAMESPACE,
+                bucket_name=INPUT_BUCKET,
+                object_name=file_name),
+            compartment_id=COMPARTMENT_ID,
+            language="ENG",
+            document_type="INVOICE")
+    )
+
+    print(response.data)
+
+    return response.data
+
+def extract_data_with_llm(ocr_text, file_name):
+    prompt = instruction + "\n" + "\n".join(few_shot_examples) + f"\nInvoice text:\n{ocr_text}\nExtracted fields (JSON format):"
+    response = llm([HumanMessage(content=prompt)])
+
+    print(response.content)
+
+    return {
+        "file": file_name,
+        "result": response.content,
+        "timestamp": datetime.utcnow().isoformat()
+    }
+
+def save_output(result, file_name):
+    output_name = Path(file_name).stem + ".json"
+    object_storage.put_object(
+        namespace_name=NAMESPACE,
+        bucket_name=OUTPUT_BUCKET,
+        object_name=output_name,
+        put_object_body=json.dumps(result, ensure_ascii=False).encode("utf-8")
+    )
+    print(f"✅ Result saved as {output_name} in the output bucket.")
+
+def monitor_bucket():
+    print("📡 Monitoring input bucket...")
+    while True:
+        objects = object_storage.list_objects(
+            namespace_name=NAMESPACE,
+            bucket_name=INPUT_BUCKET
+        ).data.objects
+
+        for obj in objects:
+            file_name = obj.name
+            if file_name.endswith((".png", ".jpg", ".jpeg")) and file_name not in processed_files:
+                try:
+                    ocr_text = perform_ocr(file_name)
+                    result = extract_data_with_llm(ocr_text, file_name)
+                    save_output(result, file_name)
+                    processed_files.add(file_name)
+                except Exception as e:
+                    print(f"❌ Error processing {file_name}: {e}")
+
+        time.sleep(30)  # Wait 30 seconds before checking again
+
+if __name__ == "__main__":  # start polling only when executed as a script, not when imported
+    monitor_bucket()
\ No newline at end of file
diff --git a/files/requirements.txt b/files/requirements.txt
new file mode 100644
index 0000000..1797eed
--- /dev/null
+++ b/files/requirements.txt
@@ -0,0 +1,13 @@
+langchain==0.3.23
+langchain_community~=0.3.12
+langchain_cohere
+oci-cli~=3.58.0
+langchain-core~=0.3.56
+langchain-text-splitters~=0.3.8
+ollama
+llama_index
+langgraph==0.3.25
+requests==2.32.3
+oci~=2.154.0
+setuptools~=79.0.1
+tqdm~=4.67.1