Link Search Menu Expand Document

Parse PDF Invoice - Python

Document Parser sample in Python demonstrating ‘Parse PDF Invoice’

GenericInvoice-Template.json
{
  "templateName": "Generic Invoice [en]",
  "templateVersion": 4,
  "templatePriority": 999999,
  "objects": [
    {
      "name": "companyName",
      "objectType": "field",
      "fieldProperties": {
        "fieldType": "macros",
        "expression": "$$funcFindCompany",
        "regex": true
      },
      "id": 0
    },
    {
      "name": "companyName2",
      "objectType": "field",
      "fieldProperties": {
        "fieldType": "macros",
        "expression": "$$funcFindCompanyNext",
        "regex": true
      },
      "id": 1
    },
    {
      "name": "invoiceId",
      "objectType": "field",
      "fieldProperties": {
        "fieldType": "macros",
        "expression": "(?is)(?:\\bInvoice|\\bNumber).*?(?<value>(?-i)INV-[A-Z0-9]+)( |\\r?$)",
        "regex": true
      },
      "id": 2
    },
    {
      "name": "invoiceIdFallback",
      "objectType": "field",
      "fieldProperties": {
        "fieldType": "macros",
        "expression": "(?i)(?:Invoice Number|Invoice #|Invoice No\\.?|Receipt No\\.?|CREDIT NOTE No\\.?|REF NO\\.?)\\s*:?\\s+(?<value>.*?)(  |\\r?$)",
        "regex": true,
        "coalesceWith": "invoiceId"
      },
      "id": 3
    },
    {
      "name": "invoiceIdFallback2",
      "objectType": "field",
      "fieldProperties": {
        "fieldType": "macros",
        "expression": "(?i)(?:\\bInvoice|\\bNumber)\\s*:?\\s+(?<value>.*?)(  |\\r?$)",
        "regex": true,
        "coalesceWith": "invoiceId"
      },
      "id": 4
    },
    {
      "name": "dateIssued",
      "objectType": "field",
      "fieldProperties": {
        "fieldType": "macros",
        "expression": "(?is)(?:Invoice Date|Issue Date|Date Issued|Date of Issue|Billed On).*?(?<value>{{SmartDate}})",
        "regex": true,
        "dataType": "date"
      },
      "id": 5
    },
    {
      "name": "dateIssuedFallback",
      "objectType": "field",
      "fieldProperties": {
        "fieldType": "macros",
        "expression": "$$funcFindMinDate",
        "regex": true,
        "dataType": "date",
        "coalesceWith": "dateIssued"
      },
      "id": 6
    },
    {
      "name": "dateDue",
      "objectType": "field",
      "fieldProperties": {
        "fieldType": "macros",
        "expression": "(?is)(?:Due Date|Due On|Payment Due).*?(?<value>{{SmartDate}})",
        "regex": true,
        "dataType": "date"
      },
      "id": 7
    },
    {
      "name": "dateDueFallback",
      "objectType": "field",
      "fieldProperties": {
        "fieldType": "macros",
        "expression": "$$funcFindMaxDate",
        "regex": true,
        "dataType": "date",
        "coalesceWith": "dateDue"
      },
      "id": 8
    },
    {
      "name": "bankAccount",
      "objectType": "field",
      "fieldProperties": {
        "fieldType": "macros",
        "expression": "(?i)(?:Account Number|Account #|Bank Account No\\.?|ACCT #|Checking Number)\\s*:?\\s*(?<value>[0-9\\p{Pd}]{4,20})(\\s|\\r?$)",
        "regex": true
      },
      "id": 9
    },
    {
      "name": "bankAccountFallback",
      "objectType": "field",
      "fieldProperties": {
        "fieldType": "macros",
        "expression": "(?is)(?:Account).*?(?<value>[0-9\\p{Pd}]{5,20})(\\s|\\r?$)",
        "regex": true,
        "coalesceWith": "bankAccount"
      },
      "id": 10
    },
    {
      "name": "total",
      "objectType": "field",
      "fieldProperties": {
        "fieldType": "macros",
        "expression": "(?i)(?:Invoice Total|Total Due|Total Amount|Amount Due|Amount Payable|Total Invoice|Gross Amount|Total for this invoice|Total \\(Incl\\. VAT\\)).*?(?<value>{{Number}})(\\s|$)",
        "regex": true,
        "dataType": "decimal"
      },
      "id": 11
    },
    {
      "name": "totalFallback",
      "objectType": "field",
      "fieldProperties": {
        "fieldType": "macros",
        "expression": "$$funcFindMaxNumber",
        "regex": true,
        "dataType": "decimal",
        "coalesceWith": "total"
      },
      "id": 12
    },
    {
      "name": "subTotal",
      "objectType": "field",
      "fieldProperties": {
        "fieldType": "macros",
        "expression": "(?i)(?:Subtotal|Sub Total|Invoice Sub-total|Sub-total|Taxable Value|Net Amount|Total \\(Excl\\. VAT\\)).*?(?<value>{{Number}})(\\s|$)",
        "regex": true,
        "dataType": "decimal"
      },
      "id": 13
    },
    {
      "name": "tax",
      "objectType": "field",
      "fieldProperties": {
        "fieldType": "macros",
        "expression": "(?i)(?:Total Tax|Tax Amount|Sales Tax|Total GST).*?(?<value>{{Number}})(\\s|$)",
        "regex": true,
        "dataType": "decimal"
      },
      "id": 14
    },
    {
      "name": "taxFallback",
      "objectType": "field",
      "fieldProperties": {
        "fieldType": "macros",
        "expression": "(?i)(?:\\bVAT|\\bTax|\\bGST).*?(?<value>{{Number}})(\\s|$)",
        "regex": true,
        "dataType": "decimal",
        "coalesceWith": "tax"
      },
      "id": 15
    },
    {
      "name": "table",
      "objectType": "table",
      "tableProperties": {
        "start": {
          "pageIndex": 0,
          "expression": "(?i)^.*?(Description|Quantity|Qty|Unit Price).*?$",
          "regex": true
        },
        "end": {
          "expression": "(?i)^.*?(Subtotal|Total|Amount).*?$",
          "regex": true
        }
      },
      "id": 16
    }
  ],
  "culture": "en-US",
  "description": "",
  "options": {
    "ocrMode": "auto",
    "ocrLanguage": "eng",
    "ocrResolution": 300,
    "ocrImageFilters": "",
    "ocrWhiteList": "",
    "ocrBlackList": ""
  }
}
parsePdfInvoice.py
import os
import requests # pip install requests

# The authentication key (API Key).
# Get your own by registering at https://app.pdf.co
API_KEY = "***********************************************"

# Base URL for PDF.co Web API requests
BASE_URL = "https://api.pdf.co/v1"

# Source PDF file
SourceFile = ".\\SampleInvoice.pdf"

# Destination JSON file name
DestinationFile = ".\\result.json"

# Template text. Use Document Parser (https://pdf.co/document-parser, https://app.pdf.co/document-parser)
# to create templates.
# Read template from file:
file_read = open(".\\GenericInvoice-Template.json", mode='r', encoding="utf-8",errors="ignore")
Template = file_read.read()
file_read.close()

def main(args = None):
    uploadedFileUrl = uploadFile(SourceFile)

    if (uploadedFileUrl != None):
        PerformDocumentParser(uploadedFileUrl, Template, DestinationFile)

def PerformDocumentParser(uploadedFileUrl, template, destinationFile):

    # Content
    data = {
        'url': uploadedFileUrl,
        'template': template
    }

    # Prepare URL for 'Document Parser' API request
    url = "{}/pdf/documentparser".format(BASE_URL)

    # Execute request and get response as JSON
    response = requests.post(url, data= data, headers={ "x-api-key": API_KEY })

    if (response.status_code == 200):
        json = response.json()

        if json["error"] == False:
            #  Get URL of result file
            resultFileUrl = json["url"]            
            # Download result file
            r = requests.get(resultFileUrl, stream=True)
            if (r.status_code == 200):
                with open(destinationFile, 'wb') as file:
                    for chunk in r:
                        file.write(chunk)
                print(f"Result file saved as \"{destinationFile}\" file.")
            else:
                print(f"Request error: {response.status_code} {response.reason}")
        else:
            # Show service reported error
            print(json["message"])
    else:
        print(f"Request error: {response.status_code} {response.reason}")


def uploadFile(fileName):
    """Uploads file to the cloud"""

    # 1. RETRIEVE PRESIGNED URL TO UPLOAD FILE.

    # Prepare URL for 'Get Presigned URL' API request
    url = "{}/file/upload/get-presigned-url?contenttype=application/octet-stream&name={}".format(
        BASE_URL, os.path.basename(fileName))

    # Execute request and get response as JSON
    response = requests.get(url, headers={"x-api-key": API_KEY})
    if (response.status_code == 200):
        json = response.json()

        if json["error"] == False:
            # URL to use for file upload
            uploadUrl = json["presignedUrl"]
            # URL for future reference
            uploadedFileUrl = json["url"]

            # 2. UPLOAD FILE TO CLOUD.
            with open(fileName, 'rb') as file:
                requests.put(uploadUrl, data=file,
                             headers={"x-api-key": API_KEY, "content-type": "application/octet-stream"})

            return uploadedFileUrl
        else:
            # Show service reported error
            print(json["message"])
    else:
        print(f"Request error: {response.status_code} {response.reason}")

    return None


if __name__ == '__main__':
    main()

PDF.co Web API: the Web API with a set of tools for documents manipulation, data conversion, data extraction, splitting and merging of documents. Includes image recognition, built-in OCR, barcode generation and barcode decoders to decode bar codes from scans, pictures and pdf.

Get your PDF.co API key here!

Download Source Code (.zip)

return to the previous page explore Document Parser endpoint