Academic Paper Repository
Objective
Create a research paper repository with citation tracking and semantic search. This enables researchers to discover related papers, track citations, and explore research topics efficiently.
Step 1: Create Papers Table
Create a table for research papers.
CREATE TABLE research_papers (
id INTEGER PRIMARY KEY,
paper_pdf PDF,
title VARCHAR(500) NOT NULL,
abstract TEXT,
publication_type VARCHAR(50),
journal_name VARCHAR(255),
conference_name VARCHAR(255),
publication_date DATE,
doi VARCHAR(100) UNIQUE,
keywords VARCHAR(500),
field_of_study VARCHAR(100),
citation_count INTEGER DEFAULT 0,
abstract_embedding VECTOR(384),
is_open_access BOOLEAN DEFAULT FALSE,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
Step 2: Create Authors Table
Create a table for paper authors.
CREATE TABLE paper_authors (
id INTEGER PRIMARY KEY,
paper_id INTEGER NOT NULL,
author_name VARCHAR(255) NOT NULL,
author_order INTEGER NOT NULL,
affiliation VARCHAR(255),
email VARCHAR(255),
orcid VARCHAR(50),
is_corresponding BOOLEAN DEFAULT FALSE,
FOREIGN KEY (paper_id) REFERENCES research_papers(id)
);
Step 3: Create Citations Table
Track paper citations.
CREATE TABLE paper_citations (
id INTEGER PRIMARY KEY,
citing_paper_id INTEGER NOT NULL,
cited_paper_id INTEGER NOT NULL,
citation_context TEXT,
citation_type VARCHAR(50),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (citing_paper_id) REFERENCES research_papers(id),
FOREIGN KEY (cited_paper_id) REFERENCES research_papers(id)
);
Step 4: Insert Sample Papers
Add research paper records.
INSERT INTO research_papers (id, title, abstract, publication_type, journal_name, publication_date, doi, keywords, field_of_study, citation_count, is_open_access) VALUES
(1, 'Deep Learning for Natural Language Processing: A Comprehensive Survey',
'This survey provides a comprehensive review of deep learning techniques applied to natural language processing tasks including text classification, named entity recognition, and machine translation.',
'journal', 'Journal of Machine Learning Research', '2023-06-15', '10.1234/jmlr.2023.001',
'deep learning,NLP,neural networks,transformers', 'Computer Science', 245, TRUE),
(2, 'Transformer Architectures: Attention Mechanisms and Beyond',
'We present an analysis of transformer architectures focusing on self-attention mechanisms and their variants. Our study examines computational efficiency and performance trade-offs.',
'conference', NULL, '2023-09-20', '10.1234/conf.2023.002',
'transformers,attention,deep learning,efficiency', 'Computer Science', 180, TRUE),
(3, 'Federated Learning for Healthcare: Privacy-Preserving Machine Learning',
'This paper explores federated learning applications in healthcare settings, enabling collaborative model training while preserving patient data privacy.',
'journal', 'Nature Medicine AI', '2023-11-10', '10.1234/nmai.2023.003',
'federated learning,healthcare,privacy,machine learning', 'Healthcare AI', 95, FALSE),
(4, 'Graph Neural Networks for Molecular Property Prediction',
'We introduce a novel graph neural network architecture for predicting molecular properties with applications in drug discovery and materials science.',
'journal', 'Journal of Chemical Information', '2023-08-05', '10.1234/jci.2023.004',
'graph neural networks,molecular,drug discovery,chemistry', 'Computational Chemistry', 120, TRUE),
(5, 'Reinforcement Learning in Robotics: A Practical Guide',
'A practical guide to implementing reinforcement learning algorithms for robotic control tasks, covering simulation-to-reality transfer and sample efficiency.',
'conference', NULL, '2024-01-15', '10.1234/conf.2024.005',
'reinforcement learning,robotics,control,simulation', 'Robotics', 45, TRUE),
(6, 'Large Language Models: Capabilities and Limitations',
'We analyze the capabilities and limitations of large language models, examining emergent abilities, reasoning capabilities, and failure modes.',
'journal', 'AI Magazine', '2024-02-01', '10.1234/aim.2024.006',
'large language models,GPT,emergent abilities,AI safety', 'Computer Science', 320, TRUE);
Step 5: Insert Authors
Add paper authors.
INSERT INTO paper_authors (id, paper_id, author_name, author_order, affiliation, orcid, is_corresponding) VALUES
-- NLP Survey authors
(1, 1, 'Dr. Sarah Chen', 1, 'Stanford University', '0000-0001-1234-5678', TRUE),
(2, 1, 'Prof. Michael Zhang', 2, 'MIT', '0000-0002-2345-6789', FALSE),
(3, 1, 'Dr. Emily Watson', 3, 'Google Research', NULL, FALSE),
-- Transformer paper authors
(4, 2, 'Prof. Michael Zhang', 1, 'MIT', '0000-0002-2345-6789', TRUE),
(5, 2, 'Dr. Alex Kim', 2, 'OpenAI', NULL, FALSE),
-- Federated Learning authors
(6, 3, 'Dr. Rachel Green', 1, 'Johns Hopkins', '0000-0003-3456-7890', TRUE),
(7, 3, 'Dr. James Wilson', 2, 'Mayo Clinic', NULL, FALSE),
-- GNN paper authors
(8, 4, 'Dr. David Liu', 1, 'Caltech', '0000-0004-4567-8901', TRUE),
(9, 4, 'Prof. Lisa Park', 2, 'UC Berkeley', NULL, FALSE),
-- LLM paper authors
(10, 6, 'Dr. Sarah Chen', 1, 'Stanford University', '0000-0001-1234-5678', FALSE),
(11, 6, 'Dr. Alex Kim', 2, 'OpenAI', NULL, TRUE),
(12, 6, 'Prof. Robert Brown', 3, 'DeepMind', NULL, FALSE);
Step 6: Insert Citations
Add citation relationships.
INSERT INTO paper_citations (id, citing_paper_id, cited_paper_id, citation_context, citation_type) VALUES
(1, 2, 1, 'Building on the comprehensive survey by Chen et al.', 'foundational'),
(2, 6, 1, 'Previous work has extensively surveyed NLP applications', 'background'),
(3, 6, 2, 'Transformer architectures have become the dominant paradigm', 'foundational'),
(4, 3, 1, 'Deep learning methods have shown promise in healthcare', 'methodology'),
(5, 5, 2, 'Attention mechanisms have been adapted for robotics', 'comparison');
Step 7: Generate Embeddings
Create embeddings for semantic search.
UPDATE research_papers
SET abstract_embedding = EMBED(abstract)
WHERE abstract_embedding IS NULL;
Step 8: Semantic Paper Search
Search papers by research topic.
SELECT
title,
field_of_study,
publication_date,
citation_count,
COSINE_SIMILARITY(abstract_embedding, EMBED('machine learning for medical diagnosis')) as relevance
FROM research_papers
WHERE abstract_embedding IS NOT NULL
ORDER BY relevance DESC
LIMIT 5;
Step 9: Paper with Authors
Get paper details with author list.
SELECT
rp.title,
rp.journal_name,
rp.publication_date,
pa.author_name,
pa.affiliation,
pa.is_corresponding
FROM research_papers rp
INNER JOIN paper_authors pa ON rp.id = pa.paper_id
WHERE rp.id = 1
ORDER BY pa.author_order;
Step 10: Most Cited Papers
Get highly cited papers.
SELECT
title,
field_of_study,
publication_date,
citation_count,
is_open_access
FROM research_papers
ORDER BY citation_count DESC
LIMIT 10;
Step 11: Author Publication Record
Get papers by author.
SELECT
pa.author_name,
pa.affiliation,
COUNT(DISTINCT rp.id) as paper_count,
SUM(rp.citation_count) as total_citations,
AVG(rp.citation_count) as avg_citations
FROM paper_authors pa
INNER JOIN research_papers rp ON pa.paper_id = rp.id
GROUP BY pa.author_name, pa.affiliation
ORDER BY total_citations DESC;
Step 12: Find Related Papers
Get papers similar to a specific one.
SELECT
b.title,
b.field_of_study,
b.citation_count,
COSINE_SIMILARITY(a.abstract_embedding, b.abstract_embedding) as similarity
FROM research_papers a
CROSS JOIN research_papers b
WHERE a.id = 1
AND b.id != 1
AND a.abstract_embedding IS NOT NULL
AND b.abstract_embedding IS NOT NULL
ORDER BY similarity DESC
LIMIT 5;
Cleanup (Optional)
DROP TABLE IF EXISTS paper_citations;
DROP TABLE IF EXISTS paper_authors;
DROP TABLE IF EXISTS research_papers;
Expected Outcomes
- Papers stored with metadata
- Authors tracked per paper
- Citations linked
- Semantic search works
- Related papers discovered
Publication Types
| Type | Description |
|---|---|
| journal | Peer-reviewed journal |
| conference | Conference proceeding |
| preprint | Pre-publication |
| thesis | Dissertation/thesis |
Key Concepts Learned
- Academic paper management
- Citation network tracking
- Author collaboration
- Semantic paper search
- Research discovery