From 4868a1d458184c2adb64cf70eb835d981fee92ee Mon Sep 17 00:00:00 2001 From: Cristiano Hoshikawa Date: Mon, 21 Jul 2025 20:15:39 -0300 Subject: [PATCH] First Commit --- .idea/.gitignore | 12 ++ .idea/misc.xml | 6 + .idea/modules.xml | 8 + .idea/oci_vision_invoice.iml | 9 + .idea/vcs.xml | 6 + .oca/custom_code_review_guidelines.txt | 24 +++ README.md | 221 +++++++++++++++++++++++++ files/config | 8 + files/main.py | 150 +++++++++++++++++ files/requirements.txt | 13 ++ 10 files changed, 457 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/oci_vision_invoice.iml create mode 100644 .idea/vcs.xml create mode 100644 .oca/custom_code_review_guidelines.txt create mode 100644 README.md create mode 100644 files/config create mode 100644 files/main.py create mode 100644 files/requirements.txt diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..7d9a8e5 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,12 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Environment-dependent path to Maven home directory +/mavenHomeManager.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Zeppelin ignored files +/ZeppelinRemoteNotebooks/ diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..f03c948 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..e1d27e4 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/oci_vision_invoice.iml b/.idea/oci_vision_invoice.iml new file mode 100644 index 0000000..d6ebd48 --- /dev/null +++ b/.idea/oci_vision_invoice.iml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new 
file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.oca/custom_code_review_guidelines.txt b/.oca/custom_code_review_guidelines.txt new file mode 100644 index 0000000..a0a3b63 --- /dev/null +++ b/.oca/custom_code_review_guidelines.txt @@ -0,0 +1,24 @@ +# Sample guideline, please follow similar structure for guideline with code samples +# 1. Suggest using streams instead of simple loops for better readability. +# +# *Comment: +# Category: Minor +# Issue: Use streams instead of a loop for better readability. +# Code Block: +# +# ```java +# // Calculate squares of numbers +# List squares = new ArrayList<>(); +# for (int number : numbers) { +# squares.add(number * number); +# } +# ``` +# Recommendation: +# +# ```java +# // Calculate squares of numbers +# List squares = Arrays.stream(numbers) +# .map(n -> n * n) // Map each number to its square +# .toList(); +# ``` +# diff --git a/README.md b/README.md new file mode 100644 index 0000000..7a5e80d --- /dev/null +++ b/README.md @@ -0,0 +1,221 @@ +# 📄 Automatic Invoice Processing with OCI Vision and OCI Generative AI + +## 🧠 Objective + +This tutorial demonstrates how to implement an automated pipeline that monitors a bucket in Oracle Cloud Infrastructure (OCI) for incoming invoice images, extracts textual content using **OCI Vision**, and then applies **OCI Generative AI** (LLM) to extract structured fiscal data like invoice number, customer, and item list. + +--- + +## 🚀 Use Cases + +- Automating invoice ingestion from Object Storage. +- Extracting structured data from semi-structured scanned documents. +- Integrating OCR and LLM in real-time pipelines using OCI AI services. 
+ +--- + +## 🧱 Oracle Cloud Services Used + +| Service | Purpose | +|----------------------------|-------------------------------------------------------------------------| +| **OCI Vision** | Performs OCR (Optical Character Recognition) on uploaded invoice images.| +| **OCI Generative AI** | Extracts structured JSON data from raw OCR text using few-shot prompts. | +| **Object Storage** | Stores input invoice images and output JSON results. | + +--- + +## ⚙️ Prerequisites + +1. An OCI account with access to: + - Vision AI + - Generative AI + - Object Storage +2. Python 3.10 or later +3. A bucket for input images (e.g., `input-bucket`) and another for output files (e.g., `output-bucket`). +4. A [config](./files/config) file with: + ```json + { + "oci_profile": "DEFAULT", + "namespace": "your_namespace", + "input_bucket": "input-bucket", + "output_bucket": "output-bucket", + "compartment_id": "ocid1.compartment.oc1..xxxx", + "llm_endpoint": "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com" + } + ``` + +--- + +## 🛠️ How to Run + +1. Install the dependencies listed in [requirements.txt](./files/requirements.txt) with: + + + pip install -r requirements.txt + +2. Run the Python script [main.py](./files/main.py). +3. Upload invoice images (e.g., `.png`, `.jpg`) to your input bucket. +4. Wait for the image to be processed and the extracted JSON to be saved in the output bucket. + +--- + +## 🧩 Code Walkthrough + +### 1. Load Configuration + +```python +with open("./config", "r") as f: + config_data = json.load(f) +``` + +> Loads all required configuration values such as namespace, bucket names, compartment ID, and LLM endpoint. + +--- + +### 2. Initialize OCI Clients + +```python +oci_config = oci.config.from_file("~/.oci/config", PROFILE) +object_storage = oci.object_storage.ObjectStorageClient(oci_config) +ai_vision_client = oci.ai_vision.AIServiceVisionClient(oci_config) +``` + +> Sets up the OCI SDK clients to access Object Storage and AI Vision services. + +--- + +### 3. 
Initialize LLM + +```python +llm = ChatOCIGenAI( + model_id="meta.llama-3.1-405b-instruct", + service_endpoint=LLM_ENDPOINT, + compartment_id=COMPARTMENT_ID, + auth_profile=PROFILE, + model_kwargs={"temperature": 0.7, "top_p": 0.75, "max_tokens": 2000}, +) +``` + +> Initializes the OCI Generative AI model for natural language understanding and text-to-structure conversion. + +--- + +### 4. Few-shot Prompt + +```python +few_shot_examples = [ ... ] +instruction = """ +You are a fiscal data extractor. +... +""" +``` + +> Uses few-shot prompting: an example of the expected output shows the model how to extract structured fields like `nf` (invoice number), `customer`, `location`, and `items`. + +--- + +### 5. OCR with OCI Vision + +```python +def perform_ocr(file_name): + ... +``` + +> This function: +> - Sends the image to OCI Vision. +> - Requests text detection. +> - Returns the OCI Vision analysis result that contains the extracted text. + +--- + +### 6. Data Extraction with LLM + +```python +def extract_data_with_llm(ocr_text, file_name): + ... +``` + +> This function: +> - Combines instructions + few-shot example + OCR text. +> - Sends it to OCI Generative AI. +> - Receives structured JSON fields (as a string). + +--- + +### 7. Save Output to Object Storage + +```python +def save_output(result, file_name): + ... +``` + +> Uploads the structured result into the output bucket using the original file name with its extension replaced by `.json`. + +--- + +### 8. Main Loop: Monitor and Process + +```python +def monitor_bucket(): + ... +``` + +> Main routine that: +> - Polls the input bucket every 30 seconds. +> - Detects new `.png`, `.jpg`, `.jpeg` files. +> - Runs OCR + LLM + Upload in sequence. +> - Keeps track of already processed files in memory. + +--- + +### 9. Entry Point + +```python +if __name__ == "__main__": + monitor_bucket() +``` + +> Starts the bucket watcher and begins processing invoices automatically. 
+ +--- + +## ✅ Expected Output + +For each uploaded invoice image: +- A corresponding `.json` file is generated with structured content like: + +```json +{ + "file": "nota123.png", + "result": "{ \"nf\": \"NF102030\", \"customer\": \"Comercial ABC Ltda\", ... }", + "timestamp": "2025-07-21T12:34:56.789Z" +} +``` + +--- + +## 🧪 Testing Suggestions + +- Use real or dummy invoices with legible product lines and issuer (*emitente*) details. +- Upload multiple images in sequence to see automated processing. +- Log in to the OCI Console > Object Storage to verify results in both buckets. + +--- + +## 📌 Notes + +- OCI Vision supports Portuguese OCR (`language="POR"` can be used instead of `"ENG"`). +- The LLM prompt can be adjusted to extract other fields like `CNPJ` (company tax ID), `quantidade` (quantity), `data de emissão` (issue date), etc. +- Consider persisting `processed_files` with a database or file to make the process fault-tolerant. + +--- + +## 📚 References + +- [OCI Vision Documentation](https://docs.oracle.com/en-us/iaas/vision/) +- [OCI Generative AI Documentation](https://docs.oracle.com/en-us/iaas/generative-ai/) +- [LangChain OCI Integration](https://python.langchain.com/docs/integrations/chat/oci_gen_ai/) + +## Acknowledgments + +- **Author** - Cristiano Hoshikawa (Oracle LAD A-Team Solution Engineer) diff --git a/files/config b/files/config new file mode 100644 index 0000000..cc94e5e --- /dev/null +++ b/files/config @@ -0,0 +1,8 @@ +{ + "oci_profile": "DEFAULT", + "compartment_id": "", + "namespace": "", + "input_bucket": "", + "output_bucket": "", + "llm_endpoint": "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com" +} diff --git a/files/main.py b/files/main.py new file mode 100644 index 0000000..a377c06 --- /dev/null +++ b/files/main.py @@ -0,0 +1,150 @@ +import time +import json +from pathlib import Path +from datetime import datetime + +import oci +from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI +from langchain.schema import HumanMessage + +# ==================== +# 1. 
# Load Configuration
# ====================
# The config file is looked up relative to the current working directory, so
# the script is expected to be started from the directory that contains it.
with open("./config", "r", encoding="utf-8") as f:
    config_data = json.load(f)

NAMESPACE = config_data["namespace"]
INPUT_BUCKET = config_data["input_bucket"]
OUTPUT_BUCKET = config_data["output_bucket"]
PROFILE = config_data["oci_profile"]
COMPARTMENT_ID = config_data["compartment_id"]
LLM_ENDPOINT = config_data["llm_endpoint"]

# ====================
# 2. Initialize OCI Clients
# ====================
oci_config = oci.config.from_file("~/.oci/config", PROFILE)
object_storage = oci.object_storage.ObjectStorageClient(oci_config)
ai_vision_client = oci.ai_vision.AIServiceVisionClient(oci_config)

# ====================
# 3. Initialize LLM
# ====================
# NOTE(review): temperature 0.7 makes the extraction non-deterministic; a
# lower value is probably preferable for structured output - confirm intent.
llm = ChatOCIGenAI(
    model_id="meta.llama-3.1-405b-instruct",
    service_endpoint=LLM_ENDPOINT,
    compartment_id=COMPARTMENT_ID,
    auth_profile=PROFILE,
    model_kwargs={"temperature": 0.7, "top_p": 0.75, "max_tokens": 2000},
)

# ====================
# 4. Few-shot Prompt Base
# ====================
few_shot_examples = [
    """
    Invoice text:
    "EMITENTE": "Comercial ABC Ltda - Rua A, 123 - Belo Horizonte - MG"
    "NF": "NF102030"
    "DESTINATÁRIO": "Distribuidora XYZ - São Paulo - SP"
    "DESCRIÇÃO DO PRODUTO":
    "Cabo HDMI 2.0 2m, preto" | PRICE: 39.90
    "Teclado Mecânico RGB ABNT2" | PRICE: 199.99
    "Mouse Gamer 3200DPI" | PRICE: 89.50

    Extracted fields (JSON format):
    {
        "nf": "NF102030",
        "customer": "Comercial ABC Ltda",
        "location": "MG",
        "items": [
            {"description": "Cabo HDMI 2.0 2m, preto", "price": 39.90},
            {"description": "Teclado Mecânico RGB ABNT2", "price": 199.99},
            {"description": "Mouse Gamer 3200DPI", "price": 89.50}
        ]
    }
    """
]

instruction = """
You are a fiscal data extractor.

Your goal is to:
- Extract the invoice number (field 'nf')
- Extract the customer name (field 'customer')
- Extract the state (field 'location') — ⚠️ use **only** the state of the EMITTER company, based on its name and address.
- Extract the list of products and prices (field 'items')
"""

# ====================
# 5. Bucket Monitoring and Processing
# ====================
# Names of objects already handled in this process; kept in memory only, so
# it is lost on restart.
processed_files = set()

# Object-name suffixes (compared case-insensitively) treated as invoice images.
_IMAGE_SUFFIXES = (".png", ".jpg", ".jpeg")


def perform_ocr(file_name):
    """Run OCI Vision text detection on an object of the input bucket.

    Args:
        file_name: Name of the object inside ``INPUT_BUCKET``.

    Returns:
        The ``data`` attribute of the ``analyze_document`` response.
    """
    print(f"📄 Performing OCR on: {file_name}")

    response = ai_vision_client.analyze_document(
        analyze_document_details=oci.ai_vision.models.AnalyzeDocumentDetails(
            # Use the text-detection feature model directly rather than a
            # table-detection model with its feature type overridden.
            features=[oci.ai_vision.models.DocumentTextDetectionFeature()],
            document=oci.ai_vision.models.ObjectStorageDocumentDetails(
                source="OBJECT_STORAGE",
                namespace_name=NAMESPACE,
                bucket_name=INPUT_BUCKET,
                object_name=file_name,
            ),
            compartment_id=COMPARTMENT_ID,
            language="ENG",
            document_type="INVOICE",
        )
    )

    print(response.data)

    return response.data


def _ocr_result_to_text(ocr_result):
    """Return the recognised text lines of an OCR result, one per line.

    Strings are returned unchanged. If the object exposes no
    ``pages[].lines[].text`` values, ``str(ocr_result)`` is used instead.
    """
    if isinstance(ocr_result, str):
        return ocr_result

    texts = []
    for page in getattr(ocr_result, "pages", None) or []:
        for line in getattr(page, "lines", None) or []:
            text = getattr(line, "text", None)
            if text:
                texts.append(text)

    return "\n".join(texts) if texts else str(ocr_result)


def extract_data_with_llm(ocr_text, file_name):
    """Ask the LLM to turn OCR output into structured invoice fields.

    Args:
        ocr_text: Either plain text or the object returned by
            ``perform_ocr``; only its recognised text lines go into the
            prompt, so layout metadata does not inflate it.
        file_name: Source object name, echoed back in the result.

    Returns:
        A dict with the keys ``file``, ``result`` (the raw model answer) and
        ``timestamp`` (UTC, ISO 8601 with millisecond precision and a
        trailing ``Z``).
    """
    # Local import: the file-level import only brings in ``datetime``.
    from datetime import timezone

    invoice_text = _ocr_result_to_text(ocr_text)
    prompt = (
        instruction
        + "\n"
        + "\n".join(few_shot_examples)
        + f"\nInvoice text:\n{invoice_text}\nExtracted fields (JSON format):"
    )
    # ``invoke`` replaces the deprecated direct call on the chat model.
    response = llm.invoke([HumanMessage(content=prompt)])

    print(response.content)

    timestamp = (
        datetime.now(timezone.utc)
        .isoformat(timespec="milliseconds")
        .replace("+00:00", "Z")
    )
    return {
        "file": file_name,
        "result": response.content,
        "timestamp": timestamp,
    }


def save_output(result, file_name):
    """Upload ``result`` as JSON to the output bucket.

    The object name is the stem of ``file_name`` plus ``.json``.

    Args:
        result: JSON-serialisable data to store.
        file_name: Name of the source object the result belongs to.
    """
    output_name = Path(file_name).stem + ".json"
    object_storage.put_object(
        namespace_name=NAMESPACE,
        bucket_name=OUTPUT_BUCKET,
        object_name=output_name,
        put_object_body=json.dumps(result, ensure_ascii=False).encode("utf-8"),
        content_type="application/json",
    )
    print(f"✅ Result saved as {output_name} in the output bucket.")


def _list_input_objects():
    """Return every object summary of the input bucket, following pagination."""
    summaries = []
    start = None
    while True:
        extra = {"start": start} if start else {}
        page = object_storage.list_objects(
            namespace_name=NAMESPACE,
            bucket_name=INPUT_BUCKET,
            **extra,
        ).data
        summaries.extend(page.objects)
        # ``next_start_with`` is empty once the last page has been read.
        start = page.next_start_with
        if not start:
            return summaries


def monitor_bucket():
    """Poll the input bucket forever and process each new invoice image.

    Every 30 seconds all objects are listed; image objects not yet in
    ``processed_files`` go through OCR, LLM extraction and upload. A failure
    on one file is printed and the file is tried again on the next pass.
    """
    print("📡 Monitoring input bucket...")
    while True:
        for obj in _list_input_objects():
            file_name = obj.name
            if file_name in processed_files:
                continue
            if not file_name.lower().endswith(_IMAGE_SUFFIXES):
                continue
            try:
                ocr_text = perform_ocr(file_name)
                result = extract_data_with_llm(ocr_text, file_name)
                save_output(result, file_name)
                processed_files.add(file_name)
            except Exception as e:
                # Top-level boundary of the worker loop: report and keep
                # running so one bad file does not stop the monitor.
                print(f"❌ Error processing {file_name}: {e}")

        time.sleep(30)  # Wait 30 seconds before checking again


if __name__ == "__main__":
    monitor_bucket()

# ----------------------------------------------------------------------
# Remainder of the original patch (it shared the last physical line with the
# script above), preserved verbatim:
#
# diff --git a/files/requirements.txt b/files/requirements.txt
# new file mode 100644
# index 0000000..1797eed
# --- /dev/null
# +++ b/files/requirements.txt
# @@ -0,0 +1,13 @@
# +langchain==0.3.23
# +langchain_community~=0.3.12
# +langchain_cohere
# +oci-cli~=3.58.0
# +langchain-core~=0.3.56
# +langchain-text-splitters~=0.3.8
# +ollama
# +llama_index
# +langgraph==0.3.25
# +requests==2.32.3
# +oci~=2.154.0
# +setuptools~=79.0.1
# +tqdm~=4.67.1