Parse Simple Document - VB.NET
Document Parser sample in VB.NET demonstrating ‘Parse Simple Document’
AmazonAWS.yml
# Document Parser template: Amazon Web Services invoice.
templateName: Amazon Web Services Invoice
templateVersion: 4
templatePriority: 0
# Keywords listed for the template's detection rules.
detectionRules:
  keywords:
  - Amazon Web Services
  - ATTN
  - Invoice
# Objects (fields and one table) to extract from the document.
objects:
- name: total
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: TOTAL AMOUNT DUE ON{{Anything}}{{Dollar}}({{Number}})
    regex: true
    dataType: decimal
- name: subTotal
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: '{{LineStart}}{{Spaces}}Charges{{Spaces}}{{Dollar}}({{Number}})'
    regex: true
    dataType: decimal
- name: dateIssued
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: Invoice Date:{{Spaces}}({{Anything}}){{LineEnd}}
    regex: true
    dataType: date
    dateFormat: MMMM d , yyyy
- name: invoiceId
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: Invoice Number:{{Spaces}}({{Digits}})
    regex: true
- name: companyName
  objectType: field
  fieldProperties:
    fieldType: static
    expression: Amazon Web Services, Inc.
    regex: true
- name: companyWebsite
  objectType: field
  fieldProperties:
    fieldType: static
    expression: aws.amazon.com
    regex: true
# Rectangle field: four numeric values plus the page index they apply to.
- name: billTo
  objectType: field
  fieldProperties:
    fieldType: rectangle
    expression: Bill to Address:{{ToggleSingleLineMode}}({{AnythingGreedy}})
    regex: true
    rectangle:
    - 33
    - 115.5
    - 213.75
    - 72.75
    pageIndex: 0
- name: currency
  objectType: field
  fieldProperties:
    fieldType: static
    expression: USD
    regex: true
# Table delimited by the start/end expressions; each row is matched by the
# row expression, whose named groups (description, unitPrice) are the columns.
- name: table1
  objectType: table
  tableProperties:
    start:
      expression: '{{LineStart}}{{Spaces}}Detail{{LineEnd}}'
      regex: true
    end:
      expression: '{{EndOfPage}}'
      regex: true
    row:
      expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}{{Dollar}}(?<unitPrice>{{Number}}){{LineEnd}}'
      regex: true
    columns:
    - name: unitPrice
      dataType: decimal
DigitalOcean.yml
# Document Parser template: DigitalOcean invoice.
templateName: DigitalOcean Invoice
templateVersion: 4
templatePriority: 0
# Keywords listed for the template's detection rules.
detectionRules:
  keywords:
  - DigitalOcean
  - 101 Avenue of the Americas
  - Invoice Number
# Objects (fields and one table) to extract from the document.
objects:
- name: companyName
  objectType: field
  fieldProperties:
    fieldType: static
    expression: DigitalOcean
    regex: true
- name: invoiceId
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Invoice Number: ({{Digits}})'
    regex: true
- name: dateIssued
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Date Issued: ({{SmartDate}})'
    regex: true
    dataType: date
    dateFormat: auto-mdy
- name: total
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Total: {{Dollar}}({{Number}})'
    regex: true
    dataType: decimal
- name: currency
  objectType: field
  fieldProperties:
    fieldType: static
    expression: USD
    regex: true
# Table delimited by the start/end expressions; the row expression's named
# groups (description, hours, start, end, unitPrice) are the columns.
- name: table1
  objectType: table
  tableProperties:
    start:
      expression: Description{{Spaces}}Hours
      regex: true
    end:
      expression: 'Total:'
      regex: true
    row:
      expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}(?<hours>{{Digits}}){{Spaces}}(?<start>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}(?<end>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}{{Dollar}}(?<unitPrice>{{Number}})'
      regex: true
    columns:
    - name: hours
      dataType: integer
    - name: unitPrice
      dataType: decimal
Google.yml
# Document Parser template: Google invoice.
templateName: Google Invoice
templateVersion: 4
templatePriority: 0
# Keywords listed for the template's detection rules.
detectionRules:
  keywords:
  - Google
  - 77-0493581
  - Invoice
# Objects (fields and one table) to extract from the document.
# NOTE(review): several fields below declare no fieldType, unlike the other
# sample templates - confirm the parser's default is the intended one.
objects:
- name: invoiceId
  objectType: field
  fieldProperties:
    expression: Invoice number:{{Spaces}}({{Digits}})
    regex: true
- name: dateIssued
  objectType: field
  fieldProperties:
    expression: Issue date:{{Spaces}}({{SmartDate}})
    regex: true
    dataType: date
    dateFormat: MMM d, yyyy
- name: total
  objectType: field
  fieldProperties:
    expression: Amount due in USD:{{Spaces}}{{Number}}
    regex: true
    dataType: decimal
- name: subTotal
  objectType: field
  fieldProperties:
    expression: Subtotal in USD:{{Spaces}}{{Number}}
    regex: true
    dataType: decimal
- name: taxRate
  objectType: field
  fieldProperties:
    expression: State sales tax {{OpeningParenthesis}}{{Digits}}{{Percent}}{{ClosingParenthesis}}
    regex: true
    dataType: integer
- name: tax
  objectType: field
  fieldProperties:
    expression: State sales tax{{Anything}}{{Number}}{{LineEnd}}
    regex: true
    dataType: decimal
- name: companyName
  objectType: field
  fieldProperties:
    fieldType: static
    expression: Google LLC
    regex: true
# Rectangle field: four numeric values plus the page index they apply to.
- name: billTo
  objectType: field
  fieldProperties:
    fieldType: rectangle
    regex: true
    rectangle:
    - 0
    - 152
    - 280
    - 72
    pageIndex: 0
- name: billingId
  objectType: field
  fieldProperties:
    expression: Billing ID:{{Spaces}}({{DigitsOrSymbols}})
    regex: true
- name: currency
  objectType: field
  fieldProperties:
    fieldType: static
    expression: USD
    regex: true
# Table delimited by the start/end expressions; the row expression's named
# groups (description, interval, quantity, amount) are the columns.
- name: table1
  objectType: table
  tableProperties:
    start:
      expression: Description{{Spaces}}Interval{{Spaces}}Quantity{{Spaces}}Amount
      regex: true
    end:
      expression: Subtotal in USD
      regex: true
    row:
      expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}(?<interval>{{3Letters}}{{Space}}{{Digits}}{{Space}}{{Minus}}{{Space}}{{3Letters}}{{Space}}{{Digits}}){{Spaces}}(?<quantity>{{Digits}}){{Spaces}}(?<amount>{{Number}})'
      regex: true
    columns:
    - name: quantity
      dataType: integer
    - name: amount
      dataType: decimal
Module1.vb
Imports System.Collections.Specialized
Imports System.IO
Imports System.Net
Imports System.Text
Imports System.Threading
Imports Newtonsoft.Json.Linq
Module Module1

    ' The authentication key (API Key).
    ' Get your own by registering at https://app.pdf.co
    Const API_KEY As String = "********************************"

    ' Source PDF file
    Const SourceFile As String = ".\DigitalOcean.pdf"
    ' Const SourceFile As String = ".\AmazonAWS.pdf"
    ' Const SourceFile As String = ".\Google.pdf"

    ' PDF document password. Leave empty for unprotected documents.
    ' NOTE: this sample declares the constant but does not send it with any request.
    Const Password As String = ""

    ' Destination JSON file name
    Const DestinationFile As String = ".\result.json"

    ' (!) Make asynchronous job.
    ' The flow in Main always reads "jobId" from the response and polls the job status,
    ' i.e. it is written for the asynchronous mode.
    Const Async As Boolean = True

    ''' <summary>
    ''' Uploads <c>SourceFile</c> to PDF.co, starts a Document Parser job using the template
    ''' read from disk, polls the job until it finishes and saves the result to
    ''' <c>DestinationFile</c>. Progress and API error messages are written to the console.
    ''' </summary>
    Sub Main()

        ' Template text. Use Document Parser (https://pdf.co/document-parser, https://app.pdf.co/document-parser)
        ' to create templates.
        ' Read template from file
        Dim templateText As String = File.ReadAllText("DigitalOcean.yml")
        ' Dim templateText As String = File.ReadAllText("AmazonAWS.yml")
        ' Dim templateText As String = File.ReadAllText("Google.yml")

        ' Create standard .NET web client instance.
        ' `Using` guarantees disposal even when an exception other than WebException escapes.
        Using webClient As New WebClient()

            ' Set API Key
            webClient.Headers.Add("x-api-key", API_KEY)

            ' 1. RETRIEVE THE PRESIGNED URL TO UPLOAD THE FILE.
            ' * If you already have a direct file URL, skip to the step 3.

            ' Prepare URL for `Get Presigned URL` API call.
            ' Only the parameter value is escaped (EscapeDataString), so characters such as
            ' "&", "#" or "+" in the file name cannot break the query string.
            Dim query As String = String.Format(
                "https://api.pdf.co/v1/file/upload/get-presigned-url?contenttype=application/octet-stream&name={0}",
                Uri.EscapeDataString(Path.GetFileName(SourceFile)))

            Try
                ' Execute request
                Dim response As String = webClient.DownloadString(query)

                ' Parse JSON response
                Dim json As JObject = JObject.Parse(response)

                If Not json("error").ToObject(Of Boolean)() Then

                    ' Get URL to use for the file upload
                    Dim uploadUrl As String = json("presignedUrl").ToString()
                    ' Get URL of uploaded file to use with later API calls
                    Dim uploadedFileUrl As String = json("url").ToString()

                    ' 2. UPLOAD THE FILE TO CLOUD.

                    webClient.Headers.Add("content-type", "application/octet-stream")
                    webClient.UploadFile(uploadUrl, "PUT", SourceFile) ' You can use UploadData() instead if your file is byte array or Stream

                    ' 3. PARSE UPLOADED PDF DOCUMENT

                    ' URL for `Document Parser` API call.
                    ' The uploaded file URL is passed as a query parameter value, so it is
                    ' escaped as data to keep any "?", "&" or "=" it contains intact.
                    query = String.Format(
                        "https://api.pdf.co/v1/pdf/documentparser?url={0}&async={1}",
                        Uri.EscapeDataString(uploadedFileUrl),
                        Async)

                    ' The template is sent as a form field in the POST body
                    Dim requestBody As New NameValueCollection()
                    requestBody.Add("template", templateText)

                    ' Execute request
                    Dim responseBytes As Byte() = webClient.UploadValues(query, "POST", requestBody)
                    response = Encoding.UTF8.GetString(responseBytes)

                    ' Parse JSON response
                    json = JObject.Parse(response)

                    If Not json("error").ToObject(Of Boolean)() Then

                        ' Asynchronous job ID
                        Dim jobId As String = json("jobId").ToString()
                        ' URL of the generated JSON file that will be available after the job completion
                        Dim resultFileUrl As String = json("url").ToString()

                        ' Check the job status in a loop.
                        ' If you don't want to pause the main thread you can rework the code
                        ' to use a separate thread for the status checking and completion.
                        Do
                            Dim status As String = CheckJobStatus(jobId) ' Possible statuses: "working", "failed", "aborted", "success".

                            ' Display timestamp and status (for demo purposes)
                            Console.WriteLine(DateTime.Now.ToLongTimeString() + ": " + status)

                            If status = "success" Then
                                ' Download the resulting JSON file
                                webClient.DownloadFile(resultFileUrl, DestinationFile)
                                Console.WriteLine("Generated JSON file saved as ""{0}"" file.", DestinationFile)
                                Exit Do
                            ElseIf status = "working" Then
                                ' Pause for a few seconds
                                Thread.Sleep(3000)
                            Else
                                ' Any other status ends the polling
                                Console.WriteLine(status)
                                Exit Do
                            End If
                        Loop

                    Else
                        Console.WriteLine(json("message").ToString())
                    End If

                Else
                    ' Report a failed `Get Presigned URL` call instead of silently doing nothing
                    Console.WriteLine(json("message").ToString())
                End If

            Catch ex As WebException
                Console.WriteLine(ex.ToString())
            End Try

        End Using

        Console.WriteLine()
        Console.WriteLine("Press any key...")
        Console.ReadKey()

    End Sub

    ''' <summary>
    ''' Requests the status of a background job from the PDF.co `job/check` endpoint.
    ''' </summary>
    ''' <param name="jobId">Job ID taken from the "jobId" property of the API response.</param>
    ''' <returns>The "status" property of the JSON response converted to a string.</returns>
    Function CheckJobStatus(jobId As String) As String

        Using webClient As New WebClient()

            ' Set API Key
            webClient.Headers.Add("x-api-key", API_KEY)

            ' Escape the ID so it is always a single, well-formed query parameter value
            Dim url As String = "https://api.pdf.co/v1/job/check?jobid=" + Uri.EscapeDataString(jobId)

            Dim response As String = webClient.DownloadString(url)
            Dim json As JObject = JObject.Parse(response)

            Return Convert.ToString(json("status"))

        End Using

    End Function

End Module
PDF.co Web API: a Web API with a set of tools for document manipulation, data conversion, data extraction, and splitting and merging of documents. It includes image recognition, built-in OCR, barcode generation, and barcode decoders that read barcodes from scans, pictures, and PDF files.
Download Source Code (.zip)
return to the previous page explore Document Parser endpoint
Copyright © 2016 - 2023 PDF.co