Form Data Extraction
Objective
Extract and store structured data from PDF forms with field mapping and validation. This enables automating form processing, data entry, and document digitization workflows.
Step 1: Create Form Templates Table
Create a table for form templates.
CREATE TABLE form_templates (
id INTEGER PRIMARY KEY,
template_name VARCHAR(255) NOT NULL,
form_type VARCHAR(50) NOT NULL,
version VARCHAR(20) DEFAULT '1.0',
field_count INTEGER DEFAULT 0,
description TEXT,
is_active BOOLEAN DEFAULT TRUE,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
Step 2: Create Form Fields Table
Define expected form fields.
CREATE TABLE form_fields (
id INTEGER PRIMARY KEY,
template_id INTEGER NOT NULL,
field_name VARCHAR(100) NOT NULL,
field_label VARCHAR(255),
field_type VARCHAR(50) NOT NULL,
is_required BOOLEAN DEFAULT FALSE,
validation_pattern VARCHAR(255),
default_value VARCHAR(255),
field_order INTEGER,
FOREIGN KEY (template_id) REFERENCES form_templates(id)
);
Step 3: Create Submitted Forms Table
Store submitted form data.
CREATE TABLE submitted_forms (
id INTEGER PRIMARY KEY,
template_id INTEGER NOT NULL,
form_pdf PDF,
submission_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
submitter_name VARCHAR(255),
submitter_email VARCHAR(255),
extraction_status VARCHAR(20) DEFAULT 'pending',
validation_status VARCHAR(20) DEFAULT 'pending',
validation_errors INTEGER DEFAULT 0,
processed_by VARCHAR(100),
processed_at TIMESTAMP,
FOREIGN KEY (template_id) REFERENCES form_templates(id)
);
Step 4: Create Extracted Values Table
Store extracted field values.
CREATE TABLE extracted_values (
id INTEGER PRIMARY KEY,
form_id INTEGER NOT NULL,
field_id INTEGER NOT NULL,
extracted_value TEXT,
confidence_score DECIMAL(5, 4),
is_valid BOOLEAN,
validation_message VARCHAR(255),
manually_corrected BOOLEAN DEFAULT FALSE,
corrected_value TEXT,
FOREIGN KEY (form_id) REFERENCES submitted_forms(id),
FOREIGN KEY (field_id) REFERENCES form_fields(id)
);
Step 5: Insert Form Templates
Add sample form templates.
INSERT INTO form_templates (id, template_name, form_type, version, field_count, description) VALUES
(1, 'Employee Onboarding Form', 'hr', '2.0', 12, 'New employee information collection form'),
(2, 'Expense Report Form', 'finance', '1.5', 10, 'Employee expense reimbursement request'),
(3, 'Customer Registration Form', 'customer', '1.0', 8, 'New customer account registration'),
(4, 'Leave Request Form', 'hr', '1.2', 6, 'Employee time off request form'),
(5, 'Vendor Registration Form', 'procurement', '1.0', 15, 'New vendor onboarding form');
Step 6: Insert Form Fields
Define fields for templates.
INSERT INTO form_fields (id, template_id, field_name, field_label, field_type, is_required, validation_pattern, field_order) VALUES
-- Employee Onboarding Form fields
(1, 1, 'full_name', 'Full Legal Name', 'text', TRUE, NULL, 1),
(2, 1, 'email', 'Email Address', 'email', TRUE, '^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$', 2),
(3, 1, 'phone', 'Phone Number', 'phone', TRUE, '^[0-9]{10}$', 3),
(4, 1, 'ssn', 'Social Security Number', 'ssn', TRUE, '^[0-9]{3}-[0-9]{2}-[0-9]{4}$', 4),
(5, 1, 'start_date', 'Start Date', 'date', TRUE, NULL, 5),
(6, 1, 'department', 'Department', 'text', TRUE, NULL, 6),
(7, 1, 'job_title', 'Job Title', 'text', TRUE, NULL, 7),
(8, 1, 'manager_name', 'Manager Name', 'text', FALSE, NULL, 8),
-- Expense Report Form fields
(9, 2, 'employee_name', 'Employee Name', 'text', TRUE, NULL, 1),
(10, 2, 'employee_id', 'Employee ID', 'text', TRUE, '^EMP[0-9]{5}$', 2),
(11, 2, 'expense_date', 'Expense Date', 'date', TRUE, NULL, 3),
(12, 2, 'expense_type', 'Expense Type', 'select', TRUE, NULL, 4),
(13, 2, 'amount', 'Amount', 'currency', TRUE, '^[0-9]+\\.?[0-9]{0,2}$', 5),
(14, 2, 'description', 'Description', 'text', TRUE, NULL, 6),
-- Leave Request Form fields
(15, 4, 'employee_name', 'Employee Name', 'text', TRUE, NULL, 1),
(16, 4, 'leave_type', 'Leave Type', 'select', TRUE, NULL, 2),
(17, 4, 'start_date', 'Start Date', 'date', TRUE, NULL, 3),
(18, 4, 'end_date', 'End Date', 'date', TRUE, NULL, 4),
(19, 4, 'total_days', 'Total Days', 'number', TRUE, NULL, 5),
(20, 4, 'reason', 'Reason', 'text', FALSE, NULL, 6);
Step 7: Insert Submitted Forms
Add sample form submissions.
INSERT INTO submitted_forms (id, template_id, submission_date, submitter_name, submitter_email, extraction_status, validation_status) VALUES
(1, 1, '2024-01-15 10:30:00', 'HR Admin', 'hr@company.com', 'completed', 'valid'),
(2, 1, '2024-01-16 14:45:00', 'HR Admin', 'hr@company.com', 'completed', 'valid'),
(3, 2, '2024-01-17 09:15:00', 'John Employee', 'john@company.com', 'completed', 'valid'),
(4, 2, '2024-01-18 11:00:00', 'Jane Employee', 'jane@company.com', 'completed', 'invalid'),
(5, 4, '2024-01-19 08:30:00', 'Bob Employee', 'bob@company.com', 'pending', 'pending'),
(6, 1, '2024-01-20 16:00:00', 'HR Admin', 'hr@company.com', 'processing', 'pending');
Step 8: Insert Extracted Values
Add extracted field values.
INSERT INTO extracted_values (id, form_id, field_id, extracted_value, confidence_score, is_valid, validation_message) VALUES
-- Onboarding Form 1 values
(1, 1, 1, 'Michael Johnson', 0.9856, TRUE, NULL),
(2, 1, 2, 'michael.johnson@email.com', 0.9723, TRUE, NULL),
(3, 1, 3, '5551234567', 0.9512, TRUE, NULL),
(4, 1, 4, '123-45-6789', 0.9634, TRUE, NULL),
(5, 1, 5, '2024-02-01', 0.9845, TRUE, NULL),
(6, 1, 6, 'Engineering', 0.9567, TRUE, NULL),
(7, 1, 7, 'Software Engineer', 0.9489, TRUE, NULL),
-- Expense Report values
(8, 3, 9, 'John Employee', 0.9756, TRUE, NULL),
(9, 3, 10, 'EMP00123', 0.9623, TRUE, NULL),
(10, 3, 11, '2024-01-10', 0.9534, TRUE, NULL),
(11, 3, 12, 'Travel', 0.9412, TRUE, NULL),
(12, 3, 13, '450.00', 0.9678, TRUE, NULL),
(13, 3, 14, 'Client meeting in Chicago', 0.9345, TRUE, NULL),
-- Invalid expense report
(14, 4, 9, 'Jane Employee', 0.9234, TRUE, NULL),
(15, 4, 10, 'EMP456', 0.8123, FALSE, 'Invalid employee ID format'),
(16, 4, 13, '1,250.50', 0.7845, FALSE, 'Invalid currency format');
Step 9: Get Form with Extracted Data
View form submission with values.
SELECT
sf.id as form_id,
ft.template_name,
ff.field_label,
ev.extracted_value,
ev.confidence_score,
ev.is_valid
FROM submitted_forms sf
INNER JOIN form_templates ft ON sf.template_id = ft.id
INNER JOIN extracted_values ev ON sf.id = ev.form_id
INNER JOIN form_fields ff ON ev.field_id = ff.id
WHERE sf.id = 1
ORDER BY ff.field_order;
Step 10: Validation Summary
Get forms with validation issues.
SELECT
sf.id,
ft.template_name,
sf.submitter_name,
sf.submission_date,
COUNT(ev.id) as total_fields,
SUM(CASE WHEN ev.is_valid = FALSE THEN 1 ELSE 0 END) as invalid_fields,
sf.validation_status
FROM submitted_forms sf
INNER JOIN form_templates ft ON sf.template_id = ft.id
LEFT JOIN extracted_values ev ON sf.id = ev.form_id
GROUP BY sf.id, ft.template_name, sf.submitter_name, sf.submission_date, sf.validation_status
ORDER BY invalid_fields DESC;
Step 11: Low Confidence Extractions
Find values needing manual review.
SELECT
sf.id as form_id,
ft.template_name,
ff.field_label,
ev.extracted_value,
ev.confidence_score,
'Manual Review Needed' as action
FROM submitted_forms sf
INNER JOIN form_templates ft ON sf.template_id = ft.id
INNER JOIN extracted_values ev ON sf.id = ev.form_id
INNER JOIN form_fields ff ON ev.field_id = ff.id
WHERE ev.confidence_score < 0.9
ORDER BY ev.confidence_score;
Step 12: Extraction Performance
Analyze extraction accuracy by form type.
SELECT
ft.form_type,
ft.template_name,
COUNT(DISTINCT sf.id) as forms_processed,
AVG(ev.confidence_score) as avg_confidence,
SUM(CASE WHEN ev.is_valid THEN 1 ELSE 0 END) * 100.0 / COUNT(ev.id) as accuracy_percent
FROM form_templates ft
INNER JOIN submitted_forms sf ON ft.id = sf.template_id
INNER JOIN extracted_values ev ON sf.id = ev.form_id
WHERE sf.extraction_status = 'completed'
GROUP BY ft.id, ft.form_type, ft.template_name
ORDER BY accuracy_percent DESC;
Cleanup (Optional)
DROP TABLE IF EXISTS extracted_values;
DROP TABLE IF EXISTS submitted_forms;
DROP TABLE IF EXISTS form_fields;
DROP TABLE IF EXISTS form_templates;
Expected Outcomes
- Form fields extracted
- Values validated
- Low confidence flagged
- Manual corrections supported
- Accuracy metrics tracked
Field Types
| Type | Validation |
|---|---|
| text | No format |
| Email pattern | |
| phone | Phone pattern |
| date | Date format |
| currency | Number format |
| ssn | SSN pattern |
Key Concepts Learned
- Form template definition
- Field extraction
- Validation patterns
- Confidence scoring
- Manual review workflow