Audio Language Detection
Objective
Create a system for storing audio files with detected language information. This enables building multilingual audio libraries with automatic language classification from transcription.
Step 1: Create Multilingual Audio Table
Create a table for audio with language detection.
CREATE TABLE multilingual_audio (
id INTEGER PRIMARY KEY,
title VARCHAR(255) NOT NULL,
audio_file AUDIO(MP3),
content_type VARCHAR(50),
original_language VARCHAR(10),
detected_language VARCHAR(10),
language_confidence DECIMAL(5, 4),
transcript TEXT,
duration_seconds INTEGER,
region VARCHAR(50),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
Step 2: Create Language Reference Table
Create a reference table for supported languages.
CREATE TABLE supported_languages (
language_code VARCHAR(10) PRIMARY KEY,
language_name VARCHAR(100) NOT NULL,
native_name VARCHAR(100),
region VARCHAR(50),
is_rtl BOOLEAN DEFAULT FALSE
);
INSERT INTO supported_languages (language_code, language_name, native_name, region, is_rtl) VALUES
('en', 'English', 'English', 'Global', FALSE),
('es', 'Spanish', 'Español', 'Latin America/Spain', FALSE),
('fr', 'French', 'Français', 'France/Africa', FALSE),
('de', 'German', 'Deutsch', 'Germany/Austria', FALSE),
('zh', 'Chinese', '中文', 'China/Taiwan', FALSE),
('ja', 'Japanese', '日本語', 'Japan', FALSE),
('ko', 'Korean', '한국어', 'Korea', FALSE),
('pt', 'Portuguese', 'Português', 'Brazil/Portugal', FALSE),
('ar', 'Arabic', 'العربية', 'Middle East', TRUE),
('hi', 'Hindi', 'हिन्दी', 'India', FALSE),
('ru', 'Russian', 'Русский', 'Russia', FALSE),
('it', 'Italian', 'Italiano', 'Italy', FALSE);
Step 3: Insert Sample Multilingual Content
Add sample audio content in various languages.
INSERT INTO multilingual_audio (id, title, content_type, original_language, detected_language, language_confidence, duration_seconds, region) VALUES
(1, 'English Tutorial - Python Basics', 'tutorial', 'en', 'en', 0.9876, 1800, 'Global'),
(2, 'Spanish Podcast - Tecnología', 'podcast', 'es', 'es', 0.9654, 2400, 'Spain'),
(3, 'French News Broadcast', 'news', 'fr', 'fr', 0.9789, 600, 'France'),
(4, 'German Business Lecture', 'lecture', 'de', 'de', 0.9523, 3000, 'Germany'),
(5, 'Chinese Language Lesson', 'education', 'zh', 'zh', 0.9412, 1500, 'China'),
(6, 'Japanese Podcast - Culture', 'podcast', 'ja', 'ja', 0.9567, 2100, 'Japan'),
(7, 'Portuguese Interview', 'interview', 'pt', 'pt', 0.9345, 1800, 'Brazil'),
(8, 'Arabic News Report', 'news', 'ar', 'ar', 0.9234, 900, 'Middle East'),
(9, 'Hindi Documentary Narration', 'documentary', 'hi', 'hi', 0.9123, 3600, 'India'),
(10, 'Korean Tech Review', 'review', 'ko', 'ko', 0.9456, 1200, 'Korea'),
(11, 'English News - International', 'news', 'en', 'en', 0.9812, 720, 'UK'),
(12, 'Spanish Tutorial - Web Dev', 'tutorial', 'es', 'es', 0.9687, 2700, 'Mexico');
Step 4: Transcribe with Language Detection
Perform transcription and language detection.
UPDATE multilingual_audio
SET transcript = TRANSCRIBE(audio_file),
detected_language = 'en'
WHERE audio_file IS NOT NULL AND transcript IS NULL;
Step 5: Query by Language
Find all audio in a specific language.
SELECT
title,
content_type,
duration_seconds / 60 as duration_minutes,
language_confidence,
region
FROM multilingual_audio
WHERE detected_language = 'en'
ORDER BY language_confidence DESC;
Step 6: Language Distribution Analysis
Analyze content distribution by language.
SELECT
ma.detected_language,
sl.language_name,
COUNT(*) as audio_count,
SUM(ma.duration_seconds) / 3600 as total_hours,
AVG(ma.language_confidence) as avg_confidence
FROM multilingual_audio ma
LEFT JOIN supported_languages sl ON ma.detected_language = sl.language_code
GROUP BY ma.detected_language, sl.language_name
ORDER BY audio_count DESC;
Step 7: Content by Region
Group content by region.
SELECT
region,
COUNT(*) as audio_count,
COUNT(DISTINCT detected_language) as languages,
SUM(duration_seconds) / 60 as total_minutes
FROM multilingual_audio
GROUP BY region
ORDER BY audio_count DESC;
Step 8: Low Confidence Detections
Find audio with uncertain language detection.
SELECT
id,
title,
original_language,
detected_language,
language_confidence,
'Review needed' as status
FROM multilingual_audio
WHERE language_confidence < 0.95
ORDER BY language_confidence;
Step 9: Language Mismatch Detection
Find where detected differs from original.
SELECT
title,
original_language,
detected_language,
language_confidence
FROM multilingual_audio
WHERE original_language != detected_language
ORDER BY language_confidence;
Step 10: Content Type by Language
Analyze content types per language.
SELECT
detected_language,
content_type,
COUNT(*) as count
FROM multilingual_audio
GROUP BY detected_language, content_type
ORDER BY detected_language, count DESC;
Step 11: Recently Added by Language
Get recent content for each language.
SELECT
detected_language,
title,
content_type,
created_at
FROM multilingual_audio ma1
WHERE created_at = (
SELECT MAX(created_at)
FROM multilingual_audio ma2
WHERE ma1.detected_language = ma2.detected_language
)
ORDER BY detected_language;
Step 12: Multilingual Coverage Report
Generate coverage report for localization.
SELECT
sl.language_code,
sl.language_name,
COALESCE(audio_stats.audio_count, 0) as audio_count,
COALESCE(audio_stats.total_hours, 0) as total_hours,
CASE
WHEN audio_stats.audio_count >= 3 THEN 'Good'
WHEN audio_stats.audio_count >= 1 THEN 'Limited'
ELSE 'Missing'
END as coverage_status
FROM supported_languages sl
LEFT JOIN (
SELECT
detected_language,
COUNT(*) as audio_count,
SUM(duration_seconds) / 3600 as total_hours
FROM multilingual_audio
GROUP BY detected_language
) audio_stats ON sl.language_code = audio_stats.detected_language
ORDER BY audio_stats.audio_count DESC;
Cleanup (Optional)
DROP TABLE IF EXISTS multilingual_audio;
DROP TABLE IF EXISTS supported_languages;
Expected Outcomes
- Language detected from transcription
- Content organized by language
- Regional distribution tracked
- Low confidence items flagged
- Coverage gaps identified
Supported Languages
| Code | Language | Region |
|---|---|---|
| en | English | Global |
| es | Spanish | Americas/Spain |
| fr | French | France/Africa |
| de | German | Europe |
| zh | Chinese | Asia |
| ja | Japanese | Japan |
| ko | Korean | Korea |
| pt | Portuguese | Brazil/Portugal |
Key Concepts Learned
- Language detection from audio
- Confidence score tracking
- Multilingual content organization
- Coverage analysis queries
- Language reference tables