Hello World: Linear Regression (y = 2x + 1)
Objective
Train a simple regression model, deploy it, run predictions, then verify results match the known function.
Step 1: Create table and load data
-- Training set: eight points sampled exactly from y = 2x + 1 (x = 0 .. 7).
CREATE TABLE hw_train (
    x DOUBLE,
    y DOUBLE
);

INSERT INTO hw_train (x, y)
VALUES
    (0, 1),
    (1, 3),
    (2, 5),
    (3, 7),
    (4, 9),
    (5, 11),
    (6, 13),
    (7, 15);
Step 2: Create AutoML experiment
-- AutoML experiment hw_linreg: regress target (hw_train.y) on x.
-- Candidate algorithms are linear_regression and gradient_boosting, optimized
-- for rmse with kfold validation (n_folds = 3) and max_trials = 10.
CREATE EXPERIMENT hw_linreg AS
    SELECT
        x,
        y AS target
    FROM hw_train
WITH (
    task_type           = 'regression',
    target_column       = 'target',
    algorithms          = ['linear_regression','gradient_boosting'],
    optimization_metric = 'rmse',
    max_trials          = 10,
    validation_strategy = 'kfold',
    n_folds             = 3
);
Step 3: Deploy best model
-- Deploy a model named hw_model from experiment hw_linreg.
-- NOTE(review): presumably this picks the best-scoring trial (the step heading
-- says "best model") -- confirm against the engine documentation.
DEPLOY MODEL hw_model FROM EXPERIMENT hw_linreg;
Step 4: Predict on new values
-- Value to predict: x = 8, just outside the training range (0 .. 7).
-- For y = 2x + 1 the expected prediction is y_hat = 17.
-- The literal is cast to DOUBLE to match the type of the training feature
-- hw_train.x, and the statement is terminated with ';' like every other
-- statement in this script.
PREDICT y_hat USING hw_model AS
    SELECT CAST(8 AS DOUBLE) AS x;
Step 5: Validate predictions against expected values
-- Expected y for y = 2x + 1
-- (x = 8.0 -> 17.0, x = 12.5 -> 26.0).
CREATE TABLE hw_expected (
    x DOUBLE,
    y_expected DOUBLE
);

-- Explicit column list so the load does not depend on column order.
INSERT INTO hw_expected (x, y_expected)
VALUES
    (8.0, 17.0),
    (12.5, 26.0);
-- Compare predictions to expectations.
-- mismatches counts the expected rows whose prediction is missing, NULL, or
-- differs from y_expected by 1e-6 or more; a clean run returns 0.
--
-- The query is driven from hw_expected with a LEFT JOIN so that an expected x
-- with no matching prediction is still counted: its NULL y_hat makes the
-- comparison UNKNOWN, which falls through to ELSE 1. An inner join would drop
-- such rows and could report 0 mismatches falsely.
-- COALESCE turns the NULL that SUM returns over zero rows into 0.
--
-- NOTE(review): assumes PREDICT returns the input column x alongside y_hat,
-- as the join on x requires -- confirm against the engine documentation.
WITH preds AS (
    PREDICT y_hat USING hw_model AS
    SELECT x FROM hw_expected
)
SELECT
    COALESCE(
        SUM(CASE WHEN ABS(p.y_hat - e.y_expected) < 1e-6 THEN 0 ELSE 1 END),
        0
    ) AS mismatches
FROM hw_expected e
LEFT JOIN preds p USING (x);
Optional: Inspect experiment and trials
-- Experiment overview: all recorded columns for the hw_linreg experiment.
SELECT *
FROM automl_experiments
WHERE name = 'hw_linreg';
-- Top trials by RMSE: the five trials of hw_linreg with the lowest cv_score.
-- NOTE(review): assumes cv_score holds the experiment's optimization metric
-- (rmse, lower is better) -- confirm against the automl_trials schema.
-- trial_id is a tie-breaker so that LIMIT 5 returns a deterministic set when
-- several trials share the same cv_score (likely on perfectly linear data).
SELECT
    trial_id,
    model_type,
    hyperparameters,
    cv_score,
    training_time
FROM automl_trials
WHERE experiment_id = (
    SELECT id
    FROM automl_experiments
    WHERE name = 'hw_linreg'
)
ORDER BY
    cv_score ASC,
    trial_id ASC
LIMIT 5;
Expected Outcomes
- Best algorithm: linear_regression.
- Coefficients: slope ≈ 2.0, intercept ≈ 1.0.
- Quality: RMSE ≈ 0 on hw_train.
- Prediction examples:
  - Input x = 8.0 → y_hat = 17.0.
  - Input x = 12.5 → y_hat = 26.0.
- Validation query: mismatches = 0.