Multi-Page PDF Processing
Objective
Process multi-page PDF documents with individual page tracking. This enables page-level search, navigation, and content analysis for large documents.
Step 1: Create Document Table
Create a table for multi-page documents.
CREATE TABLE multipage_documents (
id INTEGER PRIMARY KEY,
title VARCHAR(255) NOT NULL,
document PDF,
category VARCHAR(50),
total_pages INTEGER NOT NULL,
file_size BIGINT,
processing_status VARCHAR(20) DEFAULT 'pending',
pages_processed INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
Step 2: Create Pages Table
Create a table for individual pages.
CREATE TABLE document_pages (
id INTEGER PRIMARY KEY,
document_id INTEGER NOT NULL,
page_number INTEGER NOT NULL,
page_image IMAGE(PNG),
extracted_text TEXT,
word_count INTEGER DEFAULT 0,
has_images BOOLEAN DEFAULT FALSE,
has_tables BOOLEAN DEFAULT FALSE,
section_title VARCHAR(255),
processing_status VARCHAR(20) DEFAULT 'pending',
processed_at TIMESTAMP,
FOREIGN KEY (document_id) REFERENCES multipage_documents(id)
);
Step 3: Insert Sample Documents
Add multi-page documents.
INSERT INTO multipage_documents (id, title, category, total_pages, file_size, processing_status, pages_processed) VALUES
(1, 'Annual Report 2024', 'financial', 85, 15728640, 'completed', 85),
(2, 'Technical Manual v3', 'technical', 120, 20971520, 'completed', 120),
(3, 'Employee Handbook', 'hr', 45, 7340032, 'completed', 45),
(4, 'Research Study', 'research', 68, 11534336, 'processing', 34),
(5, 'Product Catalog', 'marketing', 150, 52428800, 'pending', 0);
Step 4: Insert Page Records
Add individual page records.
INSERT INTO document_pages (id, document_id, page_number, extracted_text, word_count, has_images, has_tables, section_title, processing_status, processed_at) VALUES
-- Annual Report pages
(1, 1, 1, 'Annual Report 2024 - Cover Page', 5, TRUE, FALSE, 'Cover', 'completed', '2024-01-15 10:00:00'),
(2, 1, 2, 'Table of Contents listing all sections and page numbers', 45, FALSE, TRUE, 'Table of Contents', 'completed', '2024-01-15 10:01:00'),
(3, 1, 3, 'Letter from the CEO discussing company performance and future outlook', 350, FALSE, FALSE, 'CEO Letter', 'completed', '2024-01-15 10:02:00'),
(4, 1, 4, 'Executive summary of financial highlights and key achievements', 420, TRUE, TRUE, 'Executive Summary', 'completed', '2024-01-15 10:03:00'),
(5, 1, 5, 'Revenue breakdown by product line and geographic region', 280, TRUE, TRUE, 'Financial Highlights', 'completed', '2024-01-15 10:04:00'),
-- Technical Manual pages
(6, 2, 1, 'Technical Manual - Version 3.0', 8, TRUE, FALSE, 'Cover', 'completed', '2024-01-16 09:00:00'),
(7, 2, 2, 'Introduction to the system architecture and components', 380, TRUE, FALSE, 'Introduction', 'completed', '2024-01-16 09:01:00'),
(8, 2, 3, 'System requirements including hardware and software dependencies', 290, FALSE, TRUE, 'Requirements', 'completed', '2024-01-16 09:02:00'),
(9, 2, 4, 'Installation procedures step by step guide', 450, TRUE, FALSE, 'Installation', 'completed', '2024-01-16 09:03:00'),
(10, 2, 5, 'Configuration options and settings reference', 520, FALSE, TRUE, 'Configuration', 'completed', '2024-01-16 09:04:00'),
-- Employee Handbook pages
(11, 3, 1, 'Employee Handbook - Welcome', 12, TRUE, FALSE, 'Welcome', 'completed', '2024-01-17 11:00:00'),
(12, 3, 2, 'Company history and mission statement', 280, FALSE, FALSE, 'About Us', 'completed', '2024-01-17 11:01:00'),
(13, 3, 3, 'Employment policies and procedures overview', 420, FALSE, TRUE, 'Policies', 'completed', '2024-01-17 11:02:00');
Step 5: Update Processing Status
Update document processing progress.
UPDATE multipage_documents
SET pages_processed = (
SELECT COUNT(*) FROM document_pages
WHERE document_pages.document_id = multipage_documents.id
AND document_pages.processing_status = 'completed'
);
SELECT title, total_pages, pages_processed FROM multipage_documents;
Step 6: Get Document Overview
View document with page summary.
SELECT
md.title,
md.category,
md.total_pages,
md.pages_processed,
md.file_size / 1048576 as size_mb,
md.processing_status,
CAST(md.pages_processed AS DECIMAL) / md.total_pages * 100 as progress_percent
FROM multipage_documents md
ORDER BY md.title;
Step 7: Get Page Content
Retrieve pages for a document.
SELECT
page_number,
section_title,
word_count,
has_images,
has_tables,
processing_status
FROM document_pages
WHERE document_id = 1
ORDER BY page_number;
Step 8: Search Across Pages
Search for content across all pages.
SELECT
md.title as document,
dp.page_number,
dp.section_title,
dp.extracted_text
FROM multipage_documents md
INNER JOIN document_pages dp ON md.id = dp.document_id
WHERE dp.extracted_text LIKE '%financial%'
OR dp.extracted_text LIKE '%revenue%'
ORDER BY md.title, dp.page_number;
Step 9: Pages with Tables
Find pages containing tables.
SELECT
md.title as document,
dp.page_number,
dp.section_title
FROM multipage_documents md
INNER JOIN document_pages dp ON md.id = dp.document_id
WHERE dp.has_tables = TRUE
ORDER BY md.title, dp.page_number;
Step 10: Content Distribution
Analyze content distribution by section.
SELECT
section_title,
COUNT(*) as page_count,
SUM(word_count) as total_words,
AVG(word_count) as avg_words
FROM document_pages
WHERE processing_status = 'completed'
GROUP BY section_title
ORDER BY page_count DESC;
Step 11: Processing Queue
Get pages pending processing.
SELECT
md.title as document,
dp.page_number,
dp.processing_status
FROM multipage_documents md
INNER JOIN document_pages dp ON md.id = dp.document_id
WHERE dp.processing_status IN ('pending', 'processing')
ORDER BY md.id, dp.page_number
LIMIT 10;
Step 12: Document Completeness
Check document processing completeness.
SELECT
md.title,
md.total_pages,
COUNT(CASE WHEN dp.processing_status = 'completed' THEN 1 END) as completed_pages,
COUNT(CASE WHEN dp.processing_status = 'pending' THEN 1 END) as pending_pages,
CASE
WHEN md.pages_processed = md.total_pages THEN 'Complete'
WHEN md.pages_processed > 0 THEN 'In Progress'
ELSE 'Not Started'
END as status
FROM multipage_documents md
LEFT JOIN document_pages dp ON md.id = dp.document_id
GROUP BY md.id, md.title, md.total_pages, md.pages_processed
ORDER BY md.title;
Cleanup (Optional)
DROP TABLE IF EXISTS document_pages;
DROP TABLE IF EXISTS multipage_documents;
Expected Outcomes
- Multi-page documents stored
- Individual pages tracked
- Page-level search works
- Processing status monitored
- Content analysis available
Page Content Types
| Type | Description |
|---|---|
| Cover | Title page |
| TOC | Table of contents |
| Content | Main content |
| Appendix | Supporting material |
| Index | Document index |
Key Concepts Learned
- Page-level document processing
- Progress tracking
- Multi-page search
- Content type identification
- Processing queues