Document Classification
Objective
Implement automatic document classification using AI embeddings and rule-based systems. This enables intelligent document routing, automated filing, and content organization.
Step 1: Create Categories Table
Create a table for document categories.
CREATE TABLE document_categories (
id INTEGER PRIMARY KEY,
name VARCHAR(100) NOT NULL,
description TEXT,
parent_id INTEGER,
keywords VARCHAR(500),
category_embedding VECTOR(384),
is_active BOOLEAN DEFAULT TRUE,
FOREIGN KEY (parent_id) REFERENCES document_categories(id)
);
Step 2: Create Classifiable Documents Table
Create a table for documents to classify.
CREATE TABLE classifiable_documents (
id INTEGER PRIMARY KEY,
document_pdf PDF,
filename VARCHAR(255) NOT NULL,
extracted_text TEXT,
content_embedding VECTOR(384),
predicted_category_id INTEGER,
prediction_confidence DECIMAL(5, 4),
actual_category_id INTEGER,
classification_status VARCHAR(20) DEFAULT 'pending',
classified_at TIMESTAMP,
verified_by VARCHAR(100),
verified_at TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (predicted_category_id) REFERENCES document_categories(id),
FOREIGN KEY (actual_category_id) REFERENCES document_categories(id)
);
Step 3: Create Classification Rules Table
Define rule-based classification.
CREATE TABLE classification_rules (
id INTEGER PRIMARY KEY,
category_id INTEGER NOT NULL,
rule_name VARCHAR(100) NOT NULL,
rule_type VARCHAR(20) NOT NULL,
pattern VARCHAR(500),
priority INTEGER DEFAULT 100,
is_active BOOLEAN DEFAULT TRUE,
FOREIGN KEY (category_id) REFERENCES document_categories(id)
);
Step 4: Insert Document Categories
Add classification categories.
INSERT INTO document_categories (id, name, description, parent_id, keywords) VALUES
-- Top-level categories
(1, 'Financial', 'Financial documents and reports', NULL, 'invoice,payment,budget,expense,revenue,financial'),
(2, 'Legal', 'Legal documents and contracts', NULL, 'contract,agreement,legal,terms,liability,court'),
(3, 'Human Resources', 'HR-related documents', NULL, 'employee,hiring,benefits,payroll,performance'),
(4, 'Technical', 'Technical documentation', NULL, 'specification,architecture,API,technical,system'),
(5, 'Marketing', 'Marketing and sales materials', NULL, 'campaign,brand,marketing,promotion,sales'),
-- Financial subcategories
(6, 'Invoices', 'Customer and vendor invoices', 1, 'invoice,bill,payment due,amount owed'),
(7, 'Reports', 'Financial reports and statements', 1, 'report,statement,quarterly,annual,P&L'),
(8, 'Budgets', 'Budget documents and forecasts', 1, 'budget,forecast,allocation,spending'),
-- Legal subcategories
(9, 'Contracts', 'Business contracts', 2, 'contract,agreement,parties,effective date'),
(10, 'Policies', 'Company policies', 2, 'policy,guideline,procedure,compliance'),
-- Technical subcategories
(11, 'Specifications', 'Product/system specifications', 4, 'specification,requirements,design,features'),
(12, 'Documentation', 'User and technical documentation', 4, 'documentation,manual,guide,instructions');
Step 5: Generate Category Embeddings
Create embeddings for categories.
UPDATE document_categories
SET category_embedding = EMBED(name || ' ' || COALESCE(description, '') || ' ' || COALESCE(keywords, ''))
WHERE category_embedding IS NULL;
Step 6: Insert Classification Rules
Add keyword-based rules.
INSERT INTO classification_rules (id, category_id, rule_name, rule_type, pattern, priority) VALUES
-- Invoice rules
(1, 6, 'Invoice Number Pattern', 'keyword', 'invoice number', 90),
(2, 6, 'Payment Due Pattern', 'keyword', 'payment due', 85),
(3, 6, 'Amount Due Pattern', 'keyword', 'amount due', 85),
-- Contract rules
(4, 9, 'Agreement Pattern', 'keyword', 'hereby agrees', 90),
(5, 9, 'Parties Pattern', 'keyword', 'between the parties', 88),
(6, 9, 'Effective Date Pattern', 'keyword', 'effective date', 85),
-- Report rules
(7, 7, 'Quarterly Report', 'keyword', 'quarterly report', 90),
(8, 7, 'Financial Statement', 'keyword', 'financial statement', 90),
-- Specification rules
(9, 11, 'Requirements Pattern', 'keyword', 'system requirements', 88),
(10, 11, 'Specification Pattern', 'keyword', 'technical specification', 90);
Step 7: Insert Sample Documents
Add documents for classification.
INSERT INTO classifiable_documents (id, filename, extracted_text, classification_status) VALUES
(1, 'doc_001.pdf', 'Invoice Number: INV-2024-0123. Payment due within 30 days. Amount due: $5,250.00. Please remit payment to the address below.', 'pending'),
(2, 'doc_002.pdf', 'This Agreement is entered into between Company A and Company B. The effective date of this contract is January 1, 2024. The parties hereby agree to the following terms and conditions.', 'pending'),
(3, 'doc_003.pdf', 'Q4 2023 Quarterly Report. Financial Statement Summary. Revenue increased 15% compared to previous quarter. Operating expenses remained stable.', 'pending'),
(4, 'doc_004.pdf', 'Technical Specification Document. System Requirements: The application shall support concurrent users up to 10,000. API response time must be under 200ms.', 'pending'),
(5, 'doc_005.pdf', 'Employee Handbook. Benefits information and payroll schedule. Performance review process and career development guidelines for all employees.', 'pending'),
(6, 'doc_006.pdf', 'Marketing Campaign Brief. Brand positioning strategy for Q1 2024. Target audience analysis and promotional channel recommendations.', 'pending');
Step 8: Generate Document Embeddings
Create embeddings for documents.
UPDATE classifiable_documents
SET content_embedding = EMBED(extracted_text)
WHERE content_embedding IS NULL
AND extracted_text IS NOT NULL;
Step 9: AI-Based Classification
Classify documents using embeddings.
SELECT
cd.id,
cd.filename,
dc.name as predicted_category,
COSINE_SIMILARITY(cd.content_embedding, dc.category_embedding) as confidence
FROM classifiable_documents cd
CROSS JOIN document_categories dc
WHERE cd.classification_status = 'pending'
AND cd.content_embedding IS NOT NULL
AND dc.category_embedding IS NOT NULL
AND dc.parent_id IS NOT NULL
ORDER BY cd.id, confidence DESC;
Step 10: Update Classifications
Apply best predictions.
UPDATE classifiable_documents cd
SET predicted_category_id = (
SELECT dc.id
FROM document_categories dc
WHERE dc.category_embedding IS NOT NULL
AND dc.parent_id IS NOT NULL
ORDER BY COSINE_SIMILARITY(cd.content_embedding, dc.category_embedding) DESC
LIMIT 1
),
prediction_confidence = (
SELECT MAX(COSINE_SIMILARITY(cd.content_embedding, dc.category_embedding))
FROM document_categories dc
WHERE dc.category_embedding IS NOT NULL
AND dc.parent_id IS NOT NULL
),
classification_status = 'classified',
classified_at = CURRENT_TIMESTAMP
WHERE cd.classification_status = 'pending'
AND cd.content_embedding IS NOT NULL;
SELECT cd.filename, dc.name as category, cd.prediction_confidence
FROM classifiable_documents cd
LEFT JOIN document_categories dc ON cd.predicted_category_id = dc.id;
Step 11: Low Confidence Review Queue
Get documents needing manual review.
SELECT
cd.id,
cd.filename,
dc.name as predicted_category,
cd.prediction_confidence,
'Manual Review Required' as status
FROM classifiable_documents cd
LEFT JOIN document_categories dc ON cd.predicted_category_id = dc.id
WHERE cd.prediction_confidence < 0.6
OR cd.predicted_category_id IS NULL
ORDER BY cd.prediction_confidence;
Step 12: Classification Accuracy Report
Analyze classification performance.
SELECT
dc.name as category,
COUNT(*) as document_count,
AVG(cd.prediction_confidence) as avg_confidence,
COUNT(CASE WHEN cd.prediction_confidence >= 0.7 THEN 1 END) as high_confidence,
COUNT(CASE WHEN cd.prediction_confidence < 0.5 THEN 1 END) as low_confidence
FROM classifiable_documents cd
INNER JOIN document_categories dc ON cd.predicted_category_id = dc.id
WHERE cd.classification_status = 'classified'
GROUP BY dc.id, dc.name
ORDER BY document_count DESC;
Cleanup (Optional)
DROP TABLE IF EXISTS classification_rules;
DROP TABLE IF EXISTS classifiable_documents;
DROP TABLE IF EXISTS document_categories;
Expected Outcomes
- Documents auto-classified
- Confidence scores assigned
- Low confidence flagged
- Categories organized
- Accuracy tracked
Classification Methods
| Method | Best For |
|---|---|
| AI Embedding | Semantic understanding |
| Keyword Rules | Specific patterns |
| Hybrid | Balanced accuracy |
Key Concepts Learned
- AI document classification
- Category embeddings
- Confidence thresholds
- Review queue management
- Classification accuracy