Vector Similarity Image Search
Objective
Build a semantic image search system using vector embeddings to find visually and conceptually similar images, enabling content-based image retrieval beyond simple keyword matching.
Step 1: Create Image Catalog Table
Store images with embeddings.
CREATE TABLE image_catalog (
id INTEGER PRIMARY KEY,
image_code VARCHAR(50) NOT NULL UNIQUE,
image_name VARCHAR(200),
description TEXT,
image_file IMAGE(JPEG),
thumbnail IMAGE(JPEG),
category VARCHAR(100),
tags TEXT,
file_size_mb DECIMAL(10, 2),
width INTEGER,
height INTEGER,
format VARCHAR(20),
image_embedding VECTOR(512),
description_embedding VECTOR(384),
indexed_at TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
Step 2: Create Image Collections Table
Organize images into sets.
CREATE TABLE image_collections (
id INTEGER PRIMARY KEY,
collection_code VARCHAR(50) NOT NULL UNIQUE,
collection_name VARCHAR(200),
description TEXT,
owner_id VARCHAR(50),
is_public BOOLEAN DEFAULT FALSE,
image_count INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE collection_images (
id INTEGER PRIMARY KEY,
collection_id INTEGER NOT NULL,
image_id INTEGER NOT NULL,
added_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (collection_id) REFERENCES image_collections(id),
FOREIGN KEY (image_id) REFERENCES image_catalog(id)
);
Step 3: Create Search History Table
Track search patterns.
CREATE TABLE image_search_history (
id INTEGER PRIMARY KEY,
user_id VARCHAR(50),
search_type VARCHAR(50),
query_text TEXT,
query_image_id INTEGER,
results_count INTEGER,
top_result_id INTEGER,
search_time_ms INTEGER,
searched_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (query_image_id) REFERENCES image_catalog(id),
FOREIGN KEY (top_result_id) REFERENCES image_catalog(id)
);
Step 4: Create Similar Images Cache Table
Cache similarity results.
CREATE TABLE similar_images_cache (
id INTEGER PRIMARY KEY,
source_image_id INTEGER NOT NULL,
similar_image_id INTEGER NOT NULL,
similarity_score DECIMAL(5, 4),
rank_position INTEGER,
cache_method VARCHAR(50),
cached_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
expires_at TIMESTAMP,
FOREIGN KEY (source_image_id) REFERENCES image_catalog(id),
FOREIGN KEY (similar_image_id) REFERENCES image_catalog(id)
);
Step 5: Insert Sample Images
Add image catalog.
INSERT INTO image_catalog (id, image_code, image_name, description, category, tags, file_size_mb, width, height, format, created_at) VALUES
(1, 'IMG-001', 'Mountain Sunset', 'Beautiful sunset over mountain range with orange and purple sky', 'nature', 'sunset,mountains,landscape,sky,orange', 4.5, 4000, 3000, 'JPEG', '2024-01-01 10:00:00'),
(2, 'IMG-002', 'Beach Sunrise', 'Calm beach scene with sunrise and gentle waves', 'nature', 'beach,sunrise,ocean,waves,calm', 3.8, 4000, 2667, 'JPEG', '2024-01-02 10:00:00'),
(3, 'IMG-003', 'City Skyline Night', 'Urban cityscape at night with illuminated buildings', 'urban', 'city,night,skyline,buildings,lights', 5.2, 5000, 3333, 'JPEG', '2024-01-03 10:00:00'),
(4, 'IMG-004', 'Forest Path', 'Winding path through autumn forest with fallen leaves', 'nature', 'forest,path,autumn,leaves,trees', 4.1, 4000, 3000, 'JPEG', '2024-01-04 10:00:00'),
(5, 'IMG-005', 'Desert Dunes', 'Sand dunes in golden hour light', 'nature', 'desert,dunes,sand,golden,landscape', 3.5, 4000, 2667, 'JPEG', '2024-01-05 10:00:00'),
(6, 'IMG-006', 'Snow Mountains', 'Snow-capped mountain peaks under blue sky', 'nature', 'mountains,snow,peaks,winter,blue', 4.8, 4500, 3000, 'JPEG', '2024-01-06 10:00:00'),
(7, 'IMG-007', 'Tropical Beach', 'Palm trees on white sand tropical beach', 'nature', 'beach,tropical,palm,sand,paradise', 3.9, 4000, 3000, 'JPEG', '2024-01-07 10:00:00'),
(8, 'IMG-008', 'Urban Street', 'Busy city street with people and shops', 'urban', 'city,street,people,shops,busy', 4.3, 4000, 2667, 'JPEG', '2024-01-08 10:00:00'),
(9, 'IMG-009', 'Waterfall', 'Majestic waterfall in tropical rainforest', 'nature', 'waterfall,rainforest,tropical,water,green', 4.6, 3000, 4000, 'JPEG', '2024-01-09 10:00:00'),
(10, 'IMG-010', 'Night Sky Stars', 'Milky Way visible in clear night sky', 'nature', 'stars,night,milkyway,sky,astronomy', 5.5, 5000, 3333, 'JPEG', '2024-01-10 10:00:00');
Step 6: Generate Embeddings
Create vector representations.
-- Update images with description embeddings
UPDATE image_catalog
SET description_embedding = EMBED(description)
WHERE description_embedding IS NULL;
-- Verify embedding generation
SELECT
image_code,
image_name,
CASE
WHEN description_embedding IS NOT NULL THEN 'Generated'
ELSE 'Pending'
END as embedding_status
FROM image_catalog
ORDER BY id;
Step 7: Insert Collections
Create image sets.
INSERT INTO image_collections (id, collection_code, collection_name, description, owner_id, is_public, image_count) VALUES
(1, 'COL-NATURE', 'Nature Photography', 'Collection of nature and landscape images', 'USR-001', TRUE, 7),
(2, 'COL-URBAN', 'Urban Scenes', 'City and urban photography', 'USR-001', TRUE, 2),
(3, 'COL-SUNSET', 'Sunset Collection', 'Images featuring sunsets and golden hour', 'USR-002', TRUE, 3);
INSERT INTO collection_images (id, collection_id, image_id) VALUES
(1, 1, 1), (2, 1, 2), (3, 1, 4), (4, 1, 5), (5, 1, 6), (6, 1, 7), (7, 1, 9),
(8, 2, 3), (9, 2, 8),
(10, 3, 1), (11, 3, 2), (12, 3, 5);
Step 8: Text-Based Semantic Search
Find images by description.
SELECT
ic.image_code,
ic.image_name,
ic.description,
ic.category,
COSINE_SIMILARITY(ic.description_embedding, EMBED('mountain landscape with dramatic sky')) as similarity
FROM image_catalog ic
WHERE ic.description_embedding IS NOT NULL
ORDER BY similarity DESC
LIMIT 5;
Step 9: Find Similar Images
Search by image similarity.
-- Find images similar to Mountain Sunset (IMG-001)
SELECT
target.image_code,
target.image_name,
target.description,
target.category,
COSINE_SIMILARITY(source.description_embedding, target.description_embedding) as similarity
FROM image_catalog source
CROSS JOIN image_catalog target
WHERE source.image_code = 'IMG-001'
AND target.image_code != 'IMG-001'
AND source.description_embedding IS NOT NULL
AND target.description_embedding IS NOT NULL
ORDER BY similarity DESC
LIMIT 5;
Step 10: Category-Filtered Search
Search within categories.
SELECT
ic.image_code,
ic.image_name,
ic.description,
COSINE_SIMILARITY(ic.description_embedding, EMBED('peaceful water scene')) as similarity
FROM image_catalog ic
WHERE ic.category = 'nature'
AND ic.description_embedding IS NOT NULL
ORDER BY similarity DESC
LIMIT 5;
Step 11: Cache Similar Images
Pre-compute similarities.
-- Calculate and cache similarities for frequently accessed images
INSERT INTO similar_images_cache (source_image_id, similar_image_id, similarity_score, rank_position, cache_method, expires_at)
SELECT
source.id as source_image_id,
target.id as similar_image_id,
COSINE_SIMILARITY(source.description_embedding, target.description_embedding) as similarity_score,
ROW_NUMBER() OVER (PARTITION BY source.id ORDER BY COSINE_SIMILARITY(source.description_embedding, target.description_embedding) DESC) as rank_position,
'batch_compute' as cache_method,
CURRENT_TIMESTAMP + INTERVAL '7 days' as expires_at
FROM image_catalog source
CROSS JOIN image_catalog target
WHERE source.id != target.id
AND source.description_embedding IS NOT NULL
AND target.description_embedding IS NOT NULL
AND source.id = 1;
Step 12: Retrieve Cached Similarities
Use pre-computed results.
SELECT
ic.image_code,
ic.image_name,
ic.description,
sic.similarity_score,
sic.rank_position
FROM similar_images_cache sic
INNER JOIN image_catalog ic ON sic.similar_image_id = ic.id
WHERE sic.source_image_id = 1
AND sic.expires_at > CURRENT_TIMESTAMP
ORDER BY sic.rank_position
LIMIT 5;
Step 13: Log Search Queries
Track search history.
INSERT INTO image_search_history (user_id, search_type, query_text, results_count, top_result_id, search_time_ms)
VALUES ('USR-001', 'text', 'mountain landscape with dramatic sky', 5, 1, 125);
INSERT INTO image_search_history (user_id, search_type, query_image_id, results_count, top_result_id, search_time_ms)
VALUES ('USR-001', 'image', 1, 5, 6, 85);
Step 14: Search Analytics
Analyze search patterns.
SELECT
search_type,
COUNT(*) as search_count,
AVG(results_count) as avg_results,
AVG(search_time_ms) as avg_time_ms
FROM image_search_history
WHERE searched_at >= CURRENT_DATE - 30
GROUP BY search_type
ORDER BY search_count DESC;
Step 15: Collection Similarity Analysis
Find similar collections.
SELECT
ic1.collection_name as collection,
ic.image_name,
COSINE_SIMILARITY(ic.description_embedding, EMBED('tropical paradise vacation')) as relevance
FROM collection_images ci
INNER JOIN image_collections ic1 ON ci.collection_id = ic1.id
INNER JOIN image_catalog ic ON ci.image_id = ic.id
WHERE ic.description_embedding IS NOT NULL
ORDER BY relevance DESC
LIMIT 10;
Cleanup (Optional)
DROP TABLE IF EXISTS similar_images_cache;
DROP TABLE IF EXISTS image_search_history;
DROP TABLE IF EXISTS collection_images;
DROP TABLE IF EXISTS image_collections;
DROP TABLE IF EXISTS image_catalog;
Expected Outcomes
- Images indexed with embeddings
- Semantic search enabled
- Similar images found
- Results cached
- Search history tracked
Similarity Score Interpretation
| Score Range | Interpretation |
|---|---|
| 0.90-1.00 | Nearly identical |
| 0.75-0.89 | Very similar |
| 0.60-0.74 | Related |
| 0.40-0.59 | Loosely related |
| < 0.40 | Different |
Key Concepts Learned
- Vector embeddings
- EMBED() function
- COSINE_SIMILARITY()
- Semantic search
- Similarity caching