First Commit

2026-03-03 16:09:39 +00:00 · 2025-07-21 20:15:39 -03:00
commit 4868a1d458
10 changed files with 457 additions and 0 deletions
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@@ -0,0 +1,12 @@
 # Default ignored files
 /shelf/
 /workspace.xml
 # Editor-based HTTP Client requests
 /httpRequests/
 # Environment-dependent path to Maven home directory
 /mavenHomeManager.xml
 # Datasource local storage ignored files
 /dataSources/
 /dataSources.local.xml
 # Zeppelin ignored files
 /ZeppelinRemoteNotebooks/
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -0,0 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ProjectRootManager" version="2" languageLevel="JDK_23" default="true" project-jdk-name="23" project-jdk-type="JavaSDK">
    <output url="file://$PROJECT_DIR$/out" />
  </component>
 </project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/oci_vision_invoice.iml" filepath="$PROJECT_DIR$/.idea/oci_vision_invoice.iml" />
    </modules>
  </component>
 </project>
--- a/.idea/oci_vision_invoice.iml
+++ b/.idea/oci_vision_invoice.iml
@@ -0,0 +1,9 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="JAVA_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="true">
    <exclude-output />
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="Git" />
  </component>
 </project>
--- a/.oca/custom_code_review_guidelines.txt
+++ b/.oca/custom_code_review_guidelines.txt
@@ -0,0 +1,24 @@
 # Sample guideline, please follow similar structure for guideline with code samples
 # 1. Suggest using streams instead of simple loops for better readability.
 #          <example>
 #          *Comment:
 #          Category: Minor
 #          Issue: Use streams instead of a loop for better readability.
 #          Code Block:
 #
 #        ```java
 #            // Calculate squares of numbers
 #            List<Integer> squares = new ArrayList<>();
 #            for (int number : numbers) {
 #              squares.add(number * number);
 #            }
 #          ```
 #          Recommendation:
 #
 #        ```java
 #            // Calculate squares of numbers
 #            List<Integer> squares = Arrays.stream(numbers)
 #              .map(n -> n * n) // Map each number to its square
 #              .toList();
 #          ```
 #          </example>
--- a/README.md
+++ b/README.md
@@ -0,0 +1,221 @@
 # 📄 Automatic Invoice Processing with OCI Vision and OCI Generative AI
 ## 🧠 Objective
 This tutorial demonstrates how to implement an automated pipeline that monitors a bucket in Oracle Cloud Infrastructure (OCI) for incoming invoice images, extracts textual content using **OCI Vision**, and then applies **OCI Generative AI** (LLM) to extract structured fiscal data like invoice number, customer, and item list.
 ---
 ## 🚀 Use Cases
 - Automating invoice ingestion from Object Storage.
 - Extracting structured data from semi-structured scanned documents.
 - Integrating OCR and LLM in real-time pipelines using OCI AI services.
 ---
 ## 🧱 Oracle Cloud Services Used
 | Service                     | Purpose                                                                 |
 |----------------------------|-------------------------------------------------------------------------|
 | **OCI Vision**             | Performs OCR (Optical Character Recognition) on uploaded invoice images.|
 | **OCI Generative AI**      | Extracts structured JSON data from raw OCR text using few-shot prompts. |
 | **Object Storage**         | Stores input invoice images and output JSON results.                    |
 ---
 ## ⚙️ Prerequisites
 1. An OCI account with access to:
    - Vision AI
    - Generative AI
    - Object Storage
 2. Python 3.10 or later.
 3. A bucket for input images (e.g., `input-bucket`) and another for output files (e.g., `output-bucket`).
 4. A [config](./files/config) with:
   ```json
   {
     "oci_profile": "DEFAULT",
     "namespace": "your_namespace",
     "input_bucket": "input-bucket",
     "output_bucket": "output-bucket",
     "compartment_id": "ocid1.compartment.oc1..xxxx",
     "llm_endpoint": "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com"
   }
   ```
 ---
 ## 🛠️ How to Run
 1. Install the dependencies listed in [requirements.txt](./files/requirements.txt):
    `pip install -r requirements.txt`
 2. Run the Python script [main.py](./files/main.py).
 3. Upload invoice images (e.g., `.png`, `.jpg`) to your input bucket.
 4. Wait for the image to be processed and the extracted JSON saved in the output bucket.
 ---
 ## 🧩 Code Walkthrough
 ### 1. Load Configuration
 ```python
 with open("./config", "r") as f:
    config_data = json.load(f)
 ```
 > Loads all required configuration values such as namespace, bucket names, compartment ID, and LLM endpoint.
 ---
 ### 2. Initialize OCI Clients
 ```python
 oci_config = oci.config.from_file("~/.oci/config", PROFILE)
 object_storage = oci.object_storage.ObjectStorageClient(oci_config)
 ai_vision_client = oci.ai_vision.AIServiceVisionClient(oci_config)
 ```
 > Sets up the OCI SDK clients to access Object Storage and AI Vision services.
 ---
 ### 3. Initialize LLM
 ```python
 llm = ChatOCIGenAI(
    model_id="meta.llama-3.1-405b-instruct",
    service_endpoint=LLM_ENDPOINT,
    compartment_id=COMPARTMENT_ID,
    auth_profile=PROFILE,
    model_kwargs={"temperature": 0.7, "top_p": 0.75, "max_tokens": 2000},
 )
 ```
 > Initializes the OCI Generative AI model for natural language understanding and text-to-structure conversion.
 ---
 ### 4. Few-shot Prompt
 ```python
 few_shot_examples = [ ... ]
 instruction = """
 You are a fiscal data extractor.
 ...
 """
 ```
 > Uses few-shot prompting: a worked example of the expected output shows the model how to extract the structured fields `nf` (invoice number), `customer`, `location`, and `items`.
 ---
 ### 5. OCR with OCI Vision
 ```python
 def perform_ocr(file_name):
    ...
 ```
 > This function:
 > - Sends the image to OCI Vision.
 > - Requests text detection.
 > - Returns the OCR result from the Vision response, which is then embedded in the LLM prompt.
 ---
 ### 6. Data Extraction with LLM
 ```python
 def extract_data_with_llm(ocr_text, file_name):
    ...
 ```
 > This function:
 > - Combines instructions + few-shot example + OCR text.
 > - Sends it to OCI Generative AI.
 > - Receives structured JSON fields (as string).
 ---
 ### 7. Save Output to Object Storage
 ```python
 def save_output(result, file_name):
    ...
 ```
 > Uploads the structured result into the output bucket using the original filename (with `.json` extension).
 ---
 ### 8. Main Loop: Monitor and Process
 ```python
 def monitor_bucket():
    ...
 ```
 > Main routine that:
 > - Monitors the input bucket every 30 seconds.
 > - Detects new `.png`, `.jpg`, `.jpeg` files.
 > - Runs OCR + LLM + Upload in sequence.
 > - Keeps track of already processed files in memory.
 ---
 ### 9. Entry Point
 ```python
 if __name__ == "__main__":
    monitor_bucket()
 ```
 > Starts the bucket watcher and begins processing invoices automatically.
 ---
 ## ✅ Expected Output
 For each uploaded invoice image:
 - A corresponding `.json` file is generated with structured content like:
 ```json
 {
  "file": "nota123.png",
  "result": "{ \"nf\": \"NF102030\", \"customer\": \"Comercial ABC Ltda\", ... }",
  "timestamp": "2025-07-21T12:34:56.789Z"
 }
 ```
 ---
 ## 🧪 Testing Suggestions
 - Use real or dummy invoices with legible product lines and issuer (*emitente*) details.
 - Upload multiple images in sequence to see automated processing.
 - Log into OCI Console > Object Storage to verify results in both buckets.
 ---
 ## 📌 Notes
 - OCI Vision supports Portuguese OCR (`language="POR"` can be used instead of `"ENG"`).
 - LLM prompt can be adjusted to extract other fields like `CNPJ`, `quantidade`, `data de emissão`, etc.
 - Consider persisting `processed_files` with a database or file to make the process fault-tolerant.
 ---
 ## 📚 References
 - [OCI Vision Documentation](https://docs.oracle.com/en-us/iaas/vision/)
 - [OCI Generative AI Documentation](https://docs.oracle.com/en-us/iaas/generative-ai/)
 - [LangChain OCI Integration](https://python.langchain.com/docs/integrations/chat/oci_gen_ai/)
 ## Acknowledgments
 - **Author** - Cristiano Hoshikawa (Oracle LAD A-Team Solution Engineer)
--- a/files/config
+++ b/files/config
@@ -0,0 +1,8 @@
 {
  "oci_profile": "DEFAULT",
  "compartment_id": "<YOUR COMPARTMENT OCID>",
  "namespace": "<YOUR OBJECT STORAGE NAMESPACE>",
  "input_bucket": "<YOUR INVOICES IMAGES BUCKET NAME>",
  "output_bucket": "<YOUR OUTPUT JSON FILES BUCKET NAME>",
  "llm_endpoint": "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com"
 }
--- a/files/main.py
+++ b/files/main.py
@@ -0,0 +1,150 @@
 import time
 import json
 from pathlib import Path
 from datetime import datetime
 import oci
 from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
 from langchain.schema import HumanMessage
 # ====================
 # 1. Load Configuration
 # ====================
 # The settings file is JSON and is resolved relative to the current working
 # directory, so the script must be started from the folder that holds `config`.
 # The encoding is pinned to UTF-8 so parsing does not depend on the platform's
 # default locale encoding (JSON text is UTF-8 by convention).
 with open("./config", "r", encoding="utf-8") as f:
    config_data = json.load(f)
 # Every key below is mandatory: a missing one raises KeyError while the module
 # is being loaded, before any OCI client is created.
 NAMESPACE = config_data["namespace"]
 INPUT_BUCKET = config_data["input_bucket"]
 OUTPUT_BUCKET = config_data["output_bucket"]
 PROFILE = config_data["oci_profile"]
 COMPARTMENT_ID = config_data["compartment_id"]
 LLM_ENDPOINT = config_data["llm_endpoint"]
 # ====================
 # 2. Initialize OCI Clients
 # ====================
 # Credentials are read from the PROFILE section of the user's OCI config file.
 oci_config = oci.config.from_file("~/.oci/config", PROFILE)
 # Used to list the input bucket and to upload the JSON results.
 object_storage = oci.object_storage.ObjectStorageClient(oci_config)
 # Used by perform_ocr() to analyze the invoice images.
 ai_vision_client = oci.ai_vision.AIServiceVisionClient(oci_config)
 # ====================
 # 3. Initialize LLM
 # ====================
 # Chat model used by extract_data_with_llm(); it is given the same profile
 # name as the SDK clients above.
 # NOTE(review): temperature=0.7 makes sampling non-deterministic; a value near
 # 0 may give more repeatable JSON for field extraction — confirm before changing.
 llm = ChatOCIGenAI(
    model_id="meta.llama-3.1-405b-instruct",
    service_endpoint=LLM_ENDPOINT,
    compartment_id=COMPARTMENT_ID,
    auth_profile=PROFILE,
    model_kwargs={"temperature": 0.7, "top_p": 0.75, "max_tokens": 2000},
 )
 # ====================
 # 4. Few-shot Prompt Base
 # ====================
 # Worked examples appended to the instruction text when the prompt is built.
 # Each entry pairs a sample "Invoice text" with the JSON expected in return.
 # In this example both "customer" and "location" are taken from the EMITENTE
 # (issuer) line, not from DESTINATÁRIO.
 few_shot_examples = [
    """
    Invoice text: 
        "EMITENTE": "Comercial ABC Ltda - Rua A, 123 - Belo Horizonte - MG"
        "NF": "NF102030"
        "DESTINATÁRIO": "Distribuidora XYZ - São Paulo - SP"
        "DESCRIÇÃO DO PRODUTO": 
            "Cabo HDMI 2.0 2m, preto" | PRICE: 39.90 
            "Teclado Mecânico RGB ABNT2" | PRICE: 199.99 
            "Mouse Gamer 3200DPI" | PRICE: 89.50 
    Extracted fields (JSON format):
        {
          "nf": "NF102030",
          "customer": "Comercial ABC Ltda",
          "location": "MG",
          "items": [
            {"description": "Cabo HDMI 2.0 2m, preto", "price": 39.90},
            {"description": "Teclado Mecânico RGB ABNT2", "price": 199.99},
            {"description": "Mouse Gamer 3200DPI", "price": 89.50}
          ]
        }
    """
 ]
 # Task description placed at the very start of every prompt; it names the four
 # output fields (nf, customer, location, items) the model must produce.
 instruction = """
 You are a fiscal data extractor.
 Your goal is to:
 - Extract the invoice number (field 'nf')
 - Extract the customer name (field 'customer')
 - Extract the state (field 'location') — ⚠️ use **only** the state of the EMITTER company, based on its name and address.
 - Extract the list of products and prices (field 'items')
 """
 # ====================
 # 5. Bucket Monitoring and Processing
 # ====================
 # Names of input objects that were processed successfully. The set lives only
 # in memory, so it starts empty again whenever the script is restarted.
 processed_files = set()
 def perform_ocr(file_name):
    """Run OCI Vision text detection on one object of the input bucket.

    Args:
        file_name: Name of the image object inside ``INPUT_BUCKET``.

    Returns:
        The ``data`` payload of the ``analyze_document`` response. It is
        returned as-is (not reduced to plain text); the caller interpolates it
        into the LLM prompt.

    Raises:
        Any error raised by the Vision client propagates to the caller.
    """
    print(f"📄 Performing OCR on: {file_name}")
    # Use the dedicated text-detection feature model instead of building the
    # table-detection model and overriding its feature type by hand.
    text_detection = oci.ai_vision.models.DocumentTextDetectionFeature()
    # The image is read by the service straight from Object Storage.
    source_document = oci.ai_vision.models.ObjectStorageDocumentDetails(
        source="OBJECT_STORAGE",
        namespace_name=NAMESPACE,
        bucket_name=INPUT_BUCKET,
        object_name=file_name,
    )
    details = oci.ai_vision.models.AnalyzeDocumentDetails(
        features=[text_detection],
        document=source_document,
        compartment_id=COMPARTMENT_ID,
        language="ENG",
        document_type="INVOICE",
    )
    response = ai_vision_client.analyze_document(analyze_document_details=details)
    print(response.data)
    return response.data
 def extract_data_with_llm(ocr_text, file_name):
    """Ask the LLM to turn OCR output into structured invoice fields.

    The prompt is the instruction text, then the few-shot examples, then the
    OCR output followed by the same "Extracted fields" cue used in the examples.

    Args:
        ocr_text: OCR output for the invoice; it is interpolated into the
            prompt with normal string formatting.
        file_name: Name of the source image, recorded in the result.

    Returns:
        A dict with the keys ``"file"``, ``"result"`` (the raw model reply,
        not parsed or validated as JSON) and ``"timestamp"`` (current UTC time
        in ISO 8601 format, including the ``+00:00`` offset).
    """
    # Local import: the module header only imports `datetime` from `datetime`.
    from datetime import timezone

    examples_text = "\n".join(few_shot_examples)
    prompt = (
        instruction
        + "\n"
        + examples_text
        + f"\nInvoice text:\n{ocr_text}\nExtracted fields (JSON format):"
    )
    # `invoke` is the supported entry point; calling the model object directly
    # is deprecated in LangChain.
    response = llm.invoke([HumanMessage(content=prompt)])
    print(response.content)
    return {
        "file": file_name,
        "result": response.content,
        # Timezone-aware replacement for the deprecated, naive datetime.utcnow().
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
 def save_output(result, file_name):
    """Upload one extraction result to the output bucket as a JSON object.

    Args:
        result: JSON-serializable value to store.
        file_name: Name of the source image. The output object is named after
            its final path component with the last extension replaced by
            ``.json``, so any prefix is dropped and ``a.png`` / ``a.jpg`` both
            map to ``a.json``.

    Raises:
        Any error raised by the Object Storage client propagates to the caller.
    """
    output_name = Path(file_name).stem + ".json"
    # ensure_ascii=False keeps accented characters readable; encode explicitly.
    payload = json.dumps(result, ensure_ascii=False).encode("utf-8")
    object_storage.put_object(
        namespace_name=NAMESPACE,
        bucket_name=OUTPUT_BUCKET,
        object_name=output_name,
        put_object_body=payload,
        # Label the object as JSON so it is not stored as a generic binary blob.
        content_type="application/json",
    )
    print(f"✅ Result saved as {output_name} in the output bucket.")
 # Image suffixes picked up by the monitor (compared case-insensitively).
 _IMAGE_SUFFIXES = (".png", ".jpg", ".jpeg")


 def _iter_input_object_names():
    """Yield the name of every object in the input bucket, page by page."""
    start = None
    while True:
        # Only pass `start` when continuing from a previous page.
        page_args = {"start": start} if start else {}
        listing = object_storage.list_objects(
            namespace_name=NAMESPACE,
            bucket_name=INPUT_BUCKET,
            **page_args,
        ).data
        for obj in listing.objects:
            yield obj.name
        # A listing that has more results carries the name to resume from.
        start = listing.next_start_with
        if not start:
            return


 def monitor_bucket(poll_interval=30):
    """Poll the input bucket forever and process every new invoice image.

    For each image not yet in ``processed_files`` the function runs OCR, LLM
    extraction and upload in sequence. A file is marked as processed only when
    all three steps succeed, so a failed file is retried on the next pass.

    Args:
        poll_interval: Seconds to wait between two scans of the bucket.

    Raises:
        Errors raised while listing the bucket propagate to the caller; errors
        raised while processing a single file are printed and skipped.
    """
    print("📡 Monitoring input bucket...")
    while True:
        for file_name in _iter_input_object_names():
            if file_name in processed_files:
                continue
            # Lower-case first so names such as "SCAN.PNG" are not ignored.
            if not file_name.lower().endswith(_IMAGE_SUFFIXES):
                continue
            try:
                ocr_text = perform_ocr(file_name)
                result = extract_data_with_llm(ocr_text, file_name)
                save_output(result, file_name)
                processed_files.add(file_name)
            except Exception as e:
                # Keep the monitor alive: one bad file must not stop the loop.
                print(f"❌ Error processing {file_name}: {e}")
        time.sleep(poll_interval)  # Wait before checking again
 # Start the endless polling loop only when the file is executed as a script,
 # not when it is imported as a module.
 if __name__ == "__main__":
    monitor_bucket()
--- a/files/requirements.txt
+++ b/files/requirements.txt
@@ -0,0 +1,13 @@
 langchain==0.3.23
 langchain_community~=0.3.12
 langchain_cohere
 oci-cli~=3.58.0
 langchain-core~=0.3.56
 langchain-text-splitters~=0.3.8
 ollama
 llama_index
 langgraph==0.3.25
 requests==2.32.3
 oci~=2.154.0
 setuptools~=79.0.1
 tqdm~=4.67.1