Drug Discovery Candidate Selection

Score compounds for likelihood of success. Prioritize top candidates for lab testing.

All recipes· ml· 10 minutesadvancedsql

Drug Discovery Candidate Selection

Objective

Score compounds for likelihood of success. Prioritize top candidates for lab testing.

Step 1: Create tables and load data

CREATE TABLE compound_properties (
  compound_id BIGINT PRIMARY KEY,
  molecular_weight DOUBLE,
  hydrogen_bonds INT,
  logP DOUBLE,                  -- lipid solubility
  toxicity_score DOUBLE,        -- lower is better
  binding_affinity DOUBLE,      -- higher magnitude = stronger binding
  target_label INT              -- 1 promising, 0 fail (null = unknown)
);

INSERT INTO compound_properties (compound_id, molecular_weight, hydrogen_bonds, logP, toxicity_score, binding_affinity, target_label ) VALUES
(1001, 350.2, 3, 2.1, 0.12, -8.7, 1),
(1002, 510.8, 1, 4.3, 0.45, -6.1, 0),
(1003, 420.0, 2, 3.2, 0.18, -9.0, 1),
(1004, 295.5, 4, 1.9, 0.10, -7.4, 1),
(1005, 560.1, 0, 5.0, 0.60, -5.2, 0),
(1006, 380.4, 3, 2.8, 0.22, -7.9, 1),
(1007, 445.2, 2, 3.5, 0.31, -6.8, 0),
(1008, 312.7, 4, 1.5, 0.15, -8.2, 1),
(1009, 489.9, 1, 4.1, 0.38, -6.4, 0),
(1010, 367.3, 3, 2.4, 0.20, -8.5, 1),
(1011, 523.5, 0, 4.8, 0.52, -5.5, 0),
(1012, 401.8, 2, 3.0, 0.25, -7.7, NULL),
(1013, 338.9, 4, 1.7, 0.13, -8.8, 1),
(1014, 475.6, 1, 3.9, 0.41, -6.0, 0),
(1015, 392.1, 3, 2.6, 0.19, -8.1, 1),
(1016, 455.8, 2, 3.7, 0.35, -6.6, 0),
(1017, 325.4, 4, 1.8, 0.11, -9.2, 1),
(1018, 508.2, 1, 4.4, 0.47, -5.8, 0),
(1019, 372.9, 3, 2.3, 0.17, -8.3, NULL),
(1020, 441.5, 2, 3.4, 0.29, -7.0, 0),
(1021, 298.8, 5, 1.2, 0.08, -9.5, 1),
(1022, 516.7, 0, 4.9, 0.55, -5.3, 0),
(1023, 385.3, 3, 2.7, 0.21, -7.8, 1),
(1024, 463.1, 2, 3.8, 0.37, -6.3, 0),
(1025, 349.6, 4, 2.0, 0.14, -8.6, 1),
(1026, 497.4, 1, 4.2, 0.44, -5.9, NULL),
(1027, 410.2, 2, 3.1, 0.26, -7.5, 1),
(1028, 358.7, 3, 2.5, 0.16, -8.4, 1),
(1029, 481.9, 1, 4.0, 0.39, -6.2, 0),
(1030, 426.5, 2, 3.3, 0.28, -7.2, NULL),
(1031, 305.1, 5, 1.4, 0.09, -9.3, 1),
(1032, 532.8, 0, 5.1, 0.58, -5.1, 0),
(1033, 397.7, 3, 2.9, 0.24, -7.6, 1),
(1034, 470.3, 1, 3.6, 0.33, -6.7, 0),
(1035, 344.2, 4, 1.6, 0.12, -8.9, 1),
(1036, 418.9, 2, 3.2, 0.27, -7.3, NULL),
(1037, 502.1, 1, 4.5, 0.48, -5.7, 0),
(1038, 361.5, 3, 2.2, 0.18, -8.0, 1),
(1039, 436.8, 2, 3.5, 0.32, -6.9, 0),
(1040, 319.3, 4, 1.9, 0.10, -9.1, 1),
(1041, 486.6, 1, 4.3, 0.42, -6.1, 0),
(1042, 376.1, 3, 2.8, 0.23, -7.9, NULL),
(1043, 449.7, 2, 3.7, 0.36, -6.5, 0),
(1044, 333.8, 4, 1.5, 0.11, -8.7, 1),
(1045, 513.4, 0, 4.7, 0.51, -5.4, 0),
(1046, 405.5, 2, 3.0, 0.25, -7.4, 1),
(1047, 352.0, 3, 2.4, 0.15, -8.5, NULL),
(1048, 477.2, 1, 3.9, 0.40, -6.0, 0),
(1049, 390.6, 3, 2.6, 0.20, -8.1, 1),
(1050, 458.5, 2, 3.8, 0.34, -6.6, 0),
(1051, 322.7, 5, 1.3, 0.07, -9.6, 1),
(1052, 521.9, 0, 5.0, 0.56, -5.2, 0),
(1053, 383.4, 3, 2.7, 0.19, -8.2, 1),
(1054, 466.9, 1, 4.1, 0.38, -6.3, NULL),
(1055, 347.1, 4, 1.8, 0.13, -8.8, 1),
(1056, 494.3, 1, 4.4, 0.45, -5.8, 0),
(1057, 413.8, 2, 3.1, 0.26, -7.5, 1),
(1058, 356.4, 3, 2.5, 0.17, -8.3, NULL),
(1059, 429.2, 2, 3.3, 0.30, -7.1, 0),
(1060, 308.5, 5, 1.1, 0.08, -9.4, 1),
(1061, 535.7, 0, 5.2, 0.59, -5.0, 0),
(1062, 399.9, 3, 2.9, 0.24, -7.7, 1),
(1063, 473.6, 1, 3.6, 0.35, -6.8, 0),
(1064, 341.3, 4, 1.7, 0.12, -8.9, NULL),
(1065, 415.7, 2, 3.2, 0.28, -7.3, 1),
(1066, 505.8, 1, 4.5, 0.49, -5.6, 0),
(1067, 364.2, 3, 2.3, 0.16, -8.4, 1),
(1068, 439.5, 2, 3.4, 0.31, -6.9, NULL),
(1069, 316.6, 4, 1.6, 0.09, -9.2, 1),
(1070, 492.1, 1, 4.2, 0.43, -6.0, 0),
(1071, 378.8, 3, 2.8, 0.22, -7.8, 1),
(1072, 452.3, 2, 3.7, 0.37, -6.4, 0),
(1073, 336.1, 4, 1.9, 0.11, -8.6, 1),
(1074, 510.5, 0, 4.8, 0.53, -5.3, NULL),
(1075, 408.4, 2, 3.0, 0.25, -7.6, 1),
(1076, 354.9, 3, 2.2, 0.14, -8.5, 1),
(1077, 480.7, 1, 4.0, 0.41, -6.2, 0),
(1078, 393.2, 3, 2.6, 0.21, -8.0, NULL),
(1079, 461.4, 2, 3.8, 0.36, -6.5, 0),
(1080, 327.8, 5, 1.4, 0.08, -9.3, 1),
(1081, 525.1, 0, 4.9, 0.54, -5.2, 0),
(1082, 387.6, 3, 2.7, 0.20, -8.1, 1),
(1083, 468.2, 1, 3.9, 0.39, -6.3, 0),
(1084, 350.5, 4, 2.0, 0.14, -8.7, NULL),
(1085, 497.9, 1, 4.3, 0.46, -5.7, 0),
(1086, 411.3, 2, 3.1, 0.27, -7.4, 1),
(1087, 359.8, 3, 2.5, 0.18, -8.3, 1),
(1088, 433.6, 2, 3.5, 0.32, -7.0, NULL),
(1089, 312.2, 5, 1.2, 0.07, -9.5, 1),
(1090, 538.3, 0, 5.1, 0.60, -4.9, 0),
(1091, 402.7, 3, 2.9, 0.23, -7.7, 1),
(1092, 476.4, 1, 3.7, 0.34, -6.7, 0),
(1093, 345.0, 4, 1.8, 0.13, -8.8, 1),
(1094, 419.8, 2, 3.2, 0.29, -7.2, NULL),
(1095, 500.2, 1, 4.4, 0.47, -5.6, 0),
(1096, 366.7, 3, 2.4, 0.17, -8.2, 1),
(1097, 442.1, 2, 3.4, 0.33, -6.8, 0),
(1098, 320.4, 4, 1.7, 0.10, -9.0, 1),
(1099, 488.8, 1, 4.1, 0.42, -6.1, NULL),
(1100, 374.3, 3, 2.6, 0.21, -7.9, 1),
(1101, 448.9, 2, 3.6, 0.35, -6.6, 0),
(1102, 330.6, 4, 1.5, 0.11, -8.9, 1),
(1103, 514.2, 0, 4.7, 0.50, -5.4, 0),
(1104, 395.1, 3, 2.8, 0.22, -7.8, NULL),
(1105, 463.7, 1, 3.8, 0.38, -6.4, 0),
(1106, 342.9, 4, 1.9, 0.12, -8.7, 1),
(1107, 507.5, 1, 4.5, 0.48, -5.5, 0),
(1108, 381.5, 3, 2.3, 0.19, -8.0, 1),
(1109, 456.2, 2, 3.7, 0.36, -6.5, NULL),
(1110, 324.8, 5, 1.3, 0.09, -9.4, 1);

Step 2: Create experiment

CREATE EXPERIMENT drug_discovery_rf AS
SELECT
  molecular_weight,
  hydrogen_bonds,
  logP,
  toxicity_score,
  binding_affinity,
  target_label AS target
FROM compound_properties
WHERE target_label IS NOT NULL
WITH (
  task_type = 'classification',
  target_column = 'target',
  algorithms = ['random_forest'],
  optimization_metric = 'roc_auc',
  validation_strategy = 'kfold',
  n_folds = 10,
  class_weight = 'balanced',
  algorithm_params = {'n_estimators': 500, 'max_depth': 12, 'min_samples_leaf': 3}
);

Step 3: Deploy best model

DEPLOY MODEL compound_model FROM EXPERIMENT drug_discovery_rf;

Step 4: Score unlabeled compounds

-- score all with unknown labels
PREDICT success_probability USING compound_model AS
SELECT
  compound_id,
  molecular_weight,
  hydrogen_bonds,
  logP,
  toxicity_score,
  binding_affinity
FROM compound_properties
WHERE target_label IS NULL;

Step 5: Rank and threshold

-- keep top 5% for lab follow-up
SELECT *
FROM (
  PREDICT success_probability USING compound_model AS
  SELECT compound_id, molecular_weight, hydrogen_bonds, logP, toxicity_score, binding_affinity
  FROM compound_properties
  WHERE target_label IS NULL
) s
ORDER BY success_probability DESC
LIMIT (SELECT CEIL(COUNT(*) * 0.05) FROM compound_properties WHERE target_label IS NULL);

Optional: Validation

-- out-of-fold metrics
SELECT roc_auc, pr_auc, accuracy, precision, recall, f1
FROM automl_experiments
WHERE name = 'drug_discovery_rf';

Expected outcomes

  • Lift in hit rate for wet-lab tests.
  • Fewer late-stage failures.
  • Feature patterns that align with known ADMET wisdom.

Tags

mlsqlregressionlife-sciencesscoring

Run this on your own machine

Install SynapCores Community Edition free, paste the SQL or Cypher above into the bundled web UI, and watch it run.

Download Free CE