Starting with SQL server 2016 we can now run our advanced statistical models right where the data lives. The increased proximity increases the efficiency of the resource intensive computations. It’s also a nice feature for developers who want to add prediction capabilities into their applications Aside - I’m using the term developer as a blanket title for data scientist, data analyst, statistician, data engineer or anyone else that might write R and SQL code. – end aside. A developer can train and persist their model in the database and then reference the trained model in a stored procedure.
Streamlined Operations
As the world changes the models we use to describe the world must change as well. The in-db analytics feature allows a streamlined platform for updating our models with the newest information about our processes. A developer can make updating models as simple as running a stored procedure.
Predicting Which Flavor of Candy I Like
A simple toy problem will be used to illustrate the feature. I’m going to ask a model to predict whether I like a piece of candy or not by passing the candy flavor as a parameter. I will provide the target value for the model to train with. The model will learn a pattern exists between the candy flavor and whether I like it or not. Spoiler alert – I only like blue raspberry. Here’s the process at a high level:
- Populate a source data table with randomly generated flavors of candy
- Train the model based on the data from step 1
- Persist the model in a binary form
- Write a store procedure referencing the model binaries
- Call the procedure, passing the candy flavor for a prediction from our model
I’m building the model with the RSNNS package. The RSNNS package is not a default package of Microsoft R, thus it will need to be installed separately. Here’s a link for installing additional packages on http://bit.ly/2uqO4hO.
--###################### Step 1 1) Populate a source data table with randomly generated flavors of candy
CREATE DATABASE classificationDemo
GO
SET NOCOUNT ON
USE classificationDemo
GO
IF OBJECT_ID('dbo.candySummary') IS NOT NULL DROP TABLE dbo.candySummary
CREATE TABLE dbo.candySummary(flavor VARCHAR(50), likeIt TINYINT)
GO
DECLARE @numInstances INT = 200,
@randomUniform DECIMAL(3,2),
@candyFlavor VARCHAR(13)
WHILE @numInstances > 0
BEGIN
SET @randomUniform = RAND()
SET @candyFlavor = CASE
WHEN @randomUniform BETWEEN 0 AND .2 THEN 'watermelon'
WHEN @randomUniform BETWEEN .21 AND .4 THEN 'grape'
WHEN @randomUniform BETWEEN .41 AND .6 THEN 'apple'
WHEN @randomUniform BETWEEN .61 AND .8 THEN 'cherry'
WHEN @randomUniform >= .81 THEN 'blueRaspberry'
END
IF @candyFlavor <> 'blueRaspberry'
INSERT INTO dbo.candySummary ( flavor , likeIt ) VALUES( @candyFlavor, 0)
ELSE
INSERT INTO dbo.candySummary ( flavor , likeIt )VALUES ( @candyFlavor , 1)
SET @numInstances = @numInstances-1
END
--Each row represents a case for picking up and observing the candy flavor attribute
--very simple simple pattern, but pattern exists
SELECT * FROM dbo.candySummary
SELECT flavor,SUM(likeIT) AS likeIT,SUM(CASE likeIt WHEN 0 THEN 1 ELSE 0 END) AS dontLikeit
FROM dbo.candySummary
GROUP BY flavor
WITH ROLLUP
-- Make sure advanced option is enabled
EXEC sp_configure 'external scripts enabled',1
Go
RECONFIGURE WITH OVERRIDE
GO
-- Requires Restart to update run value
--Need to install the RSNNS and RCPP external package, instructions here:https://docs.microsoft.com/en-us/sql/advanced-analytics/r/install-additional-r-packages-on-sql-server#1-locate-the-windows-binaries-in-zip-file-format
-- Drop packages to the location returned in script below
EXECUTE sp_execute_external_script @language = N'R'
, @script = N'OutputDataSet <- data.frame(.libPaths());'
WITH RESULT SETS (([DefaultLibraryName] VARCHAR(MAX) NOT NULL));
GO
--###################### Step 2 ) Train the model based on the data from step 1
DROP TABLE IF EXISTS dbo.model
GO
CREATE TABLE dbo.model (modelName varchar(50), modelDate DATETIME, modelBits varbinary(max))
GO
DROP PROCEDURE IF EXISTS dbo.train_candyModel
GO
CREATE PROCEDURE dbo.train_candyModel (@modelName varchar(100))
AS
BEGIN
declare @trained_model varbinary(max)
SET NOCOUNT ON
EXEC sp_execute_external_script @language =N'R',
@script=N'library(RSNNS)
my.data <- as.data.frame(my.data)
my.dataInputs <- my.data[, 1]
my.dataInputs <- decodeClassLabels(my.data[, 1])
my.dataTargets <- my.data[, 2]
my.splitData <- splitForTrainingAndTest(my.dataInputs, my.dataTargets, ratio = 0.33)
my.model <- mlp(x = my.splitData$inputsTrain, y = my.splitData$targetsTrain,
size = c(3),
maxit = 200
)
trained_model <- as.raw(serialize(my.model,NULL)) #Binary data stream
',
@input_data_1 =N'SELECT flavor ,likeIt FROM dbo.candySummary',
@input_data_1_name = N'my.data',
@params = N'@trained_model varbinary(max) OUTPUT',
@trained_model = @trained_model OUTPUT
--###################### Step 3) Persist the model in a binary form
INSERT INTO dbo.model (modelName , modelDate , modelBits )
VALUES (@modelName,Getdate(), @trained_model)
END
GO
--We can schedule this or re-train the model on demand if prediction performance starts to decline
EXECUTE dbo.train_candyModel 'CandyClassifier'
Select *
from dbo.model
--###################### Step 4) 4) Write a store procedure referencing the model binaries
DROP PROCEDURE IF EXISTS dbo.Predict_Candy_preference
GO
CREATE PROCEDURE dbo.Predict_Candy_preference (@candyFlavor varchar(20))
AS
BEGIN
SET NOCOUNT ON
DROP TABLE IF EXISTS ##flavorMap;
Create Table ##flavorMap (apple TINYINT not NULL,blueRaspberry TINYINT NOT NULL,cherry TINYINT NOT NULL, grape TINYINT NOT NULL,watermelon TINYINT NOT NULL)
IF @candyFlavor = 'apple' INSERT INTO ##flavorMap Values (1,0,0,0,0)
IF @candyFlavor = 'blueRaspberry' INSERT INTO ##flavorMap Values (0,1,0,0,0)
IF @candyFlavor = 'cherry' INSERT INTO ##flavorMap Values (0,0,1,0,0)
IF @candyFlavor = 'grape' INSERT INTO ##flavorMap Values (0,0,0,1,0)
IF @candyFlavor = 'watermelon' INSERT INTO ##flavorMap Values (0,0,0,0,1)
Declare @CCmodel varbinary(max) = (Select top 1 modelBits From dbo.model order by modelDate Desc)
EXEC sp_execute_external_script @language =N'R',
@script=N'library(RSNNS)
my.model <- unserialize(as.raw(model))
my.prediction <- predict(my.model,InputDataSet)
OutputDataSet <- as.data.frame(ifelse(my.prediction >= .9, "Yes", "No"))
',
@input_data_1 =N'SELECT * FROM ##flavorMap ',
@params = N'@model varbinary(max)',
@model = @CCmodel
WITH RESULT SETS ((likeIT Varchar(3)));
END
GO
--###################### Step 5) Call the procedure, passing the candy flavor for a prediction from our model
Execute dbo.Predict_Candy_preference 'cherry'
Execute dbo.Predict_Candy_preference 'blueRaspberry'
--My preferences changes
UPDATE dbo.candySummary
SET likeIt = 0
WHERE flavor = 'blueRaspberry'
UPDATE dbo.candySummary
SET likeIt = 1
WHERE flavor = 'cherry'
-- Re-train the model
EXECUTE dbo.train_candyModel 'CandyClassifier'
Execute dbo.Predict_Candy_preference 'blueRaspberry'
Execute dbo.Predict_Candy_preference 'cherry'
Comments