Invoice Data Extraction
Objective
Extract structured data from invoice PDFs including vendor information, line items, and totals. This enables automated invoice processing, expense tracking, and financial reporting.
Step 1: Create Invoices Table
Create a table for invoice documents.
CREATE TABLE invoice_documents (
id INTEGER PRIMARY KEY,
invoice_pdf PDF,
invoice_number VARCHAR(50) NOT NULL,
vendor_name VARCHAR(255) NOT NULL,
vendor_address TEXT,
vendor_tax_id VARCHAR(50),
invoice_date DATE NOT NULL,
due_date DATE,
subtotal DECIMAL(12, 2),
tax_amount DECIMAL(12, 2),
total_amount DECIMAL(12, 2) NOT NULL,
currency VARCHAR(3) DEFAULT 'USD',
payment_status VARCHAR(20) DEFAULT 'pending',
payment_date DATE,
extraction_status VARCHAR(20) DEFAULT 'pending',
extracted_at TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
Step 2: Create Line Items Table
Create a table for invoice line items.
CREATE TABLE invoice_line_items (
id INTEGER PRIMARY KEY,
invoice_id INTEGER NOT NULL,
line_number INTEGER NOT NULL,
description VARCHAR(500) NOT NULL,
quantity DECIMAL(10, 2) DEFAULT 1,
unit_price DECIMAL(12, 2) NOT NULL,
discount_percent DECIMAL(5, 2) DEFAULT 0,
tax_rate DECIMAL(5, 2) DEFAULT 0,
line_total DECIMAL(12, 2) NOT NULL,
category VARCHAR(50),
gl_code VARCHAR(20),
FOREIGN KEY (invoice_id) REFERENCES invoice_documents(id)
);
Step 3: Insert Sample Invoices
Add extracted invoice data.
INSERT INTO invoice_documents (id, invoice_number, vendor_name, vendor_address, vendor_tax_id, invoice_date, due_date, subtotal, tax_amount, total_amount, currency, payment_status, extraction_status, extracted_at) VALUES
(1, 'INV-2024-0001', 'Tech Solutions Inc', '123 Tech Park, San Jose, CA 95110', '12-3456789', '2024-01-15', '2024-02-15', 5000.00, 450.00, 5450.00, 'USD', 'paid', 'completed', '2024-01-16 10:00:00'),
(2, 'INV-2024-0045', 'Office Supplies Co', '456 Supply Lane, Austin, TX 78701', '98-7654321', '2024-01-20', '2024-02-20', 1250.00, 112.50, 1362.50, 'USD', 'paid', 'completed', '2024-01-21 09:30:00'),
(3, 'INV-2024-0089', 'Cloud Services Ltd', '789 Cloud Ave, Seattle, WA 98101', '45-6789012', '2024-02-01', '2024-03-01', 3500.00, 315.00, 3815.00, 'USD', 'pending', 'completed', '2024-02-02 14:15:00'),
(4, 'INV-2024-0102', 'Marketing Agency', '321 Creative Blvd, New York, NY 10001', '67-8901234', '2024-02-05', '2024-03-05', 8500.00, 765.00, 9265.00, 'USD', 'pending', 'completed', '2024-02-06 11:00:00'),
(5, 'INV-2024-0156', 'Consulting Group', '555 Advisory St, Chicago, IL 60601', '23-4567890', '2024-02-10', '2024-03-10', 12000.00, 1080.00, 13080.00, 'USD', 'pending', 'completed', '2024-02-11 16:30:00');
Step 4: Insert Line Items
Add invoice line items.
INSERT INTO invoice_line_items (id, invoice_id, line_number, description, quantity, unit_price, discount_percent, tax_rate, line_total, category, gl_code) VALUES
-- Tech Solutions invoice
(1, 1, 1, 'Software License - Annual', 5, 800.00, 0, 9.00, 4000.00, 'Software', '6100'),
(2, 1, 2, 'Implementation Services', 10, 100.00, 0, 9.00, 1000.00, 'Services', '6200'),
-- Office Supplies invoice
(3, 2, 1, 'Office Chairs - Ergonomic', 5, 200.00, 0, 9.00, 1000.00, 'Furniture', '6300'),
(4, 2, 2, 'Standing Desk Converters', 2, 125.00, 0, 9.00, 250.00, 'Furniture', '6300'),
-- Cloud Services invoice
(5, 3, 1, 'Cloud Hosting - Monthly', 1, 2500.00, 0, 9.00, 2500.00, 'Infrastructure', '6400'),
(6, 3, 2, 'Data Storage - 500GB', 1, 500.00, 0, 9.00, 500.00, 'Infrastructure', '6400'),
(7, 3, 3, 'Support Plan - Premium', 1, 500.00, 0, 9.00, 500.00, 'Support', '6500'),
-- Marketing Agency invoice
(8, 4, 1, 'Brand Strategy Development', 1, 5000.00, 0, 9.00, 5000.00, 'Marketing', '6600'),
(9, 4, 2, 'Website Redesign', 1, 3500.00, 0, 9.00, 3500.00, 'Marketing', '6600'),
-- Consulting invoice
(10, 5, 1, 'Strategic Consulting - 40hrs', 40, 250.00, 0, 9.00, 10000.00, 'Consulting', '6700'),
(11, 5, 2, 'Market Research Report', 1, 2000.00, 0, 9.00, 2000.00, 'Research', '6800');
Step 5: View Invoice Summary
Get invoice overview with line item counts.
SELECT
inv.invoice_number,
inv.vendor_name,
inv.invoice_date,
inv.total_amount,
inv.payment_status,
COUNT(li.id) as line_items
FROM invoice_documents inv
LEFT JOIN invoice_line_items li ON inv.id = li.invoice_id
GROUP BY inv.id, inv.invoice_number, inv.vendor_name, inv.invoice_date, inv.total_amount, inv.payment_status
ORDER BY inv.invoice_date DESC;
Step 6: Get Invoice Details
Retrieve full invoice with line items.
SELECT
inv.invoice_number,
inv.vendor_name,
li.line_number,
li.description,
li.quantity,
li.unit_price,
li.line_total,
li.category
FROM invoice_documents inv
INNER JOIN invoice_line_items li ON inv.id = li.invoice_id
WHERE inv.id = 1
ORDER BY li.line_number;
Step 7: Spending by Category
Analyze spending by expense category.
SELECT
li.category,
COUNT(DISTINCT inv.id) as invoice_count,
SUM(li.line_total) as total_spent,
AVG(li.line_total) as avg_line_amount
FROM invoice_documents inv
INNER JOIN invoice_line_items li ON inv.id = li.invoice_id
GROUP BY li.category
ORDER BY total_spent DESC;
Step 8: Vendor Analysis
Analyze spending by vendor.
SELECT
vendor_name,
COUNT(*) as invoice_count,
SUM(total_amount) as total_spent,
AVG(total_amount) as avg_invoice,
MIN(invoice_date) as first_invoice,
MAX(invoice_date) as last_invoice
FROM invoice_documents
GROUP BY vendor_name
ORDER BY total_spent DESC;
Step 9: Pending Payments
Get invoices pending payment.
SELECT
invoice_number,
vendor_name,
invoice_date,
due_date,
total_amount,
CASE
WHEN due_date < CURRENT_DATE THEN 'Overdue'
ELSE 'Due'
END as status
FROM invoice_documents
WHERE payment_status = 'pending'
ORDER BY due_date;
Step 10: Monthly Spending Report
Aggregate spending by month.
SELECT
invoice_date,
COUNT(*) as invoice_count,
SUM(subtotal) as subtotal,
SUM(tax_amount) as tax,
SUM(total_amount) as total
FROM invoice_documents
GROUP BY invoice_date
ORDER BY invoice_date;
Step 11: GL Code Summary
Summarize by general ledger code.
SELECT
li.gl_code,
li.category,
COUNT(*) as line_count,
SUM(li.line_total) as total_amount
FROM invoice_line_items li
GROUP BY li.gl_code, li.category
ORDER BY li.gl_code;
Step 12: Tax Summary
Calculate tax totals.
SELECT
COUNT(*) as total_invoices,
SUM(subtotal) as total_subtotal,
SUM(tax_amount) as total_tax,
SUM(total_amount) as grand_total,
AVG(tax_amount / NULLIF(subtotal, 0) * 100) as avg_tax_rate
FROM invoice_documents
WHERE extraction_status = 'completed';
Cleanup (Optional)
DROP TABLE IF EXISTS invoice_line_items;
DROP TABLE IF EXISTS invoice_documents;
Expected Outcomes
- Invoice data extracted from PDFs
- Line items stored and linked
- Category analysis works
- Vendor tracking enabled
- Financial reporting ready
Payment Status
| Status | Description |
|---|---|
| pending | Awaiting payment |
| paid | Payment complete |
| partial | Partially paid |
| overdue | Past due date |
Key Concepts Learned
- Structured data extraction
- Invoice line item management
- Financial aggregations
- Vendor analysis
- Payment tracking