Adding support for training metrics in PipelineSweeperMacro + new graph variable outputs #152

Merged · 15 commits · May 23, 2018
Changes from 1 commit
17 changes: 10 additions & 7 deletions src/Microsoft.ML.PipelineInference/AutoInference.cs
@@ -172,12 +172,14 @@ private bool GetDataVariableName(IExceptionContext ectx, string nameOfData, JTok
public sealed class RunSummary
{
public double MetricValue { get; }
public double TrainingMetricValue { get; }
public int NumRowsInTraining { get; }
public long RunTimeMilliseconds { get; }

public RunSummary(double metricValue, int numRows, long runTimeMilliseconds)
public RunSummary(double metricValue, int numRows, long runTimeMilliseconds, double trainingMetricValue)
{
MetricValue = metricValue;
TrainingMetricValue = trainingMetricValue;
NumRowsInTraining = numRows;
RunTimeMilliseconds = runTimeMilliseconds;
}
@@ -303,7 +305,7 @@ private void MainLearningLoop(int batchSize, int numOfTrainingRows)
var stopwatch = new Stopwatch();
var probabilityUtils = new Sweeper.Algorithms.SweeperProbabilityUtils(_host);

while (!_terminator.ShouldTerminate(_history))
while (!_terminator.ShouldTerminate(_history))
{
// Get next set of candidates
var currentBatchSize = batchSize;
@@ -341,16 +343,17 @@ private void ProcessPipeline(Sweeper.Algorithms.SweeperProbabilityUtils utils, S

// Run pipeline, and time how long it takes
stopwatch.Restart();
double d = candidate.RunTrainTestExperiment(_trainData.Take(randomizedNumberOfRows),
_testData, Metric, TrainerKind);
candidate.RunTrainTestExperiment(_trainData.Take(randomizedNumberOfRows),
_testData, Metric, TrainerKind, out var testMetricVal, out var trainMetricVal);
stopwatch.Stop();

// Handle key collisions on sorted list
while (_sortedSampledElements.ContainsKey(d))
d += 1e-10;
while (_sortedSampledElements.ContainsKey(testMetricVal))
testMetricVal += 1e-10;

// Save performance score
candidate.PerformanceSummary = new RunSummary(d, randomizedNumberOfRows, stopwatch.ElapsedMilliseconds);
candidate.PerformanceSummary =
new RunSummary(testMetricVal, randomizedNumberOfRows, stopwatch.ElapsedMilliseconds, trainMetricVal);
_sortedSampledElements.Add(candidate.PerformanceSummary.MetricValue, candidate);
_history.Add(candidate);
}
24 changes: 20 additions & 4 deletions src/Microsoft.ML.PipelineInference/AutoMlUtils.cs
@@ -15,21 +15,35 @@ namespace Microsoft.ML.Runtime.PipelineInference
{
public static class AutoMlUtils
{
public static AutoInference.RunSummary ExtractRunSummary(IHostEnvironment env, IDataView data, string metricColumnName)
public static AutoInference.RunSummary ExtractRunSummary(IHostEnvironment env, IDataView result, string metricColumnName, IDataView trainResult = null)
{
double metricValue = 0;
double trainingMetricValue = -1d;
int numRows = 0;
var schema = data.Schema;
var schema = result.Schema;
schema.TryGetColumnIndex(metricColumnName, out var metricCol);

using (var cursor = data.GetRowCursor(col => col == metricCol))
using (var cursor = result.GetRowCursor(col => col == metricCol))
{
var getter = cursor.GetGetter<double>(metricCol);
cursor.MoveNext();
getter(ref metricValue);
}

return new AutoInference.RunSummary(metricValue, numRows, 0);
if (trainResult != null)
{
var trainSchema = trainResult.Schema;
trainSchema.TryGetColumnIndex(metricColumnName, out var trainingMetricCol);
Contributor (review comment on: trainSchema.TryGetColumnIndex(metricColumnName, out var trainingMetricCol);)

I'm wondering if it's possible to tighten up this method a bit, in terms of its handling of inputs other than "happy path" inputs.

To give an example: if metricColumnName is not a column in trainResult, then TryGetColumnIndex will return false. However, we merely assume it succeeds, and trainingMetricCol will hold some undetermined value -- I guess default(int). And the method will happily extract that (assuming it was of type double).

Since these are public methods on a public class, these would be env.Check*-style checks. (If they were non-public, we'd still prefer to have at least asserts.)

Ideally we'd also make the other methods here a bit more robust, but perhaps that's beyond the scope of the PR. Might as well start here though.

Contributor Author: Thanks. I'll make the corresponding changes.


using (var cursor = trainResult.GetRowCursor(col => col == trainingMetricCol))
{
var getter = cursor.GetGetter<double>(trainingMetricCol);
cursor.MoveNext();
getter(ref trainingMetricValue);
}
}

return new AutoInference.RunSummary(metricValue, numRows, 0, trainingMetricValue);
}
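
A minimal sketch of the env.Check*-style hardening discussed above, assuming the usual Contracts/host helpers (CheckValue, CheckNonEmpty, Check, ExceptParam); this illustrates the reviewer's suggestion and is not the change that was eventually merged:

public static AutoInference.RunSummary ExtractRunSummary(IHostEnvironment env, IDataView result, string metricColumnName, IDataView trainResult = null)
{
    // Fail fast on bad inputs instead of reading from an undetermined column index.
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(result, nameof(result));
    env.CheckNonEmpty(metricColumnName, nameof(metricColumnName));

    double metricValue = 0;
    double trainingMetricValue = -1d;
    int numRows = 0;

    if (!result.Schema.TryGetColumnIndex(metricColumnName, out var metricCol))
        throw env.ExceptParam(nameof(metricColumnName), $"Column '{metricColumnName}' not found in the metrics data view.");

    using (var cursor = result.GetRowCursor(col => col == metricCol))
    {
        var getter = cursor.GetGetter<double>(metricCol);
        env.Check(cursor.MoveNext(), "Expected one row of overall metrics.");
        getter(ref metricValue);
    }

    if (trainResult != null)
    {
        if (!trainResult.Schema.TryGetColumnIndex(metricColumnName, out var trainingMetricCol))
            throw env.ExceptParam(nameof(metricColumnName), $"Column '{metricColumnName}' not found in the training metrics data view.");

        using (var cursor = trainResult.GetRowCursor(col => col == trainingMetricCol))
        {
            var getter = cursor.GetGetter<double>(trainingMetricCol);
            env.Check(cursor.MoveNext(), "Expected one row of training metrics.");
            getter(ref trainingMetricValue);
        }
    }

    return new AutoInference.RunSummary(metricValue, numRows, 0, trainingMetricValue);
}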

public static CommonInputs.IEvaluatorInput CloneEvaluatorInstance(CommonInputs.IEvaluatorInput evalInput) =>
@@ -618,5 +632,7 @@ public static Tuple<string, string[]>[] ConvertToSweepArgumentStrings(TlcModule.
}
return results;
}

public static string GenerateOverallTrainingMetricVarName(Guid id) => $"Var_Training_OM_{id:N}";
}
}
19 changes: 14 additions & 5 deletions src/Microsoft.ML.PipelineInference/Macros/PipelineSweeperMacro.cs
@@ -65,18 +65,24 @@ public static Output ExtractSweepResult(IHostEnvironment env, ResultInput input)
var col1 = new KeyValuePair<string, ColumnType>("Graph", TextType.Instance);
var col2 = new KeyValuePair<string, ColumnType>("MetricValue", PrimitiveType.FromKind(DataKind.R8));
var col3 = new KeyValuePair<string, ColumnType>("PipelineId", TextType.Instance);
var col4 = new KeyValuePair<string, ColumnType>("TrainingMetricValue", PrimitiveType.FromKind(DataKind.R8));
var col5 = new KeyValuePair<string, ColumnType>("FirstInput", TextType.Instance);
var col6 = new KeyValuePair<string, ColumnType>("PredictorModel", TextType.Instance);

if (rows.Count == 0)
{
var host = env.Register("ExtractSweepResult");
outputView = new EmptyDataView(host, new SimpleSchema(host, col1, col2, col3));
outputView = new EmptyDataView(host, new SimpleSchema(host, col1, col2, col3, col4, col5, col6));
}
else
{
var builder = new ArrayDataViewBuilder(env);
builder.AddColumn(col1.Key, (PrimitiveType)col1.Value, rows.Select(r => new DvText(r.GraphJson)).ToArray());
builder.AddColumn(col2.Key, (PrimitiveType)col2.Value, rows.Select(r => r.MetricValue).ToArray());
builder.AddColumn(col3.Key, (PrimitiveType)col3.Value, rows.Select(r => new DvText(r.PipelineId)).ToArray());
builder.AddColumn(col4.Key, (PrimitiveType)col4.Value, rows.Select(r => r.TrainingMetricValue).ToArray());
builder.AddColumn(col5.Key, (PrimitiveType)col5.Value, rows.Select(r => new DvText(r.FirstInput)).ToArray());
builder.AddColumn(col6.Key, (PrimitiveType)col6.Value, rows.Select(r => new DvText(r.PredictorModel)).ToArray());
outputView = builder.GetDataView();
}
return new Output { Results = outputView, State = autoMlState };
@@ -132,11 +138,11 @@ public static CommonOutputs.MacroOutput<Output> PipelineSweep(
// Extract performance summaries and assign to previous candidate pipelines.
foreach (var pipeline in autoMlState.BatchCandidates)
{
if (node.Context.TryGetVariable(ExperimentUtils.GenerateOverallMetricVarName(pipeline.UniqueId),
out var v))
if (node.Context.TryGetVariable(ExperimentUtils.GenerateOverallMetricVarName(pipeline.UniqueId), out var v) &&
node.Context.TryGetVariable(AutoMlUtils.GenerateOverallTrainingMetricVarName(pipeline.UniqueId), out var v2))
{
pipeline.PerformanceSummary =
AutoMlUtils.ExtractRunSummary(env, (IDataView)v.Value, autoMlState.Metric.Name);
AutoMlUtils.ExtractRunSummary(env, (IDataView)v.Value, autoMlState.Metric.Name, (IDataView)v2.Value);
autoMlState.AddEvaluated(pipeline);
}
}
@@ -168,14 +174,17 @@ public static CommonOutputs.MacroOutput<Output> PipelineSweep(
{
// Add train test experiments to current graph for candidate pipeline
var subgraph = new Experiment(env);
var trainTestOutput = p.AddAsTrainTest(training, testing, autoMlState.TrainerKind, subgraph);
var trainTestOutput = p.AddAsTrainTest(training, testing, autoMlState.TrainerKind, subgraph, true);

// Change variable name to reference pipeline ID in output map, context and entrypoint output.
var uniqueName = ExperimentUtils.GenerateOverallMetricVarName(p.UniqueId);
var uniqueNameTraining = AutoMlUtils.GenerateOverallTrainingMetricVarName(p.UniqueId);
var sgNode = EntryPointNode.ValidateNodes(env, node.Context,
new JArray(subgraph.GetNodes().Last()), node.Catalog).Last();
sgNode.RenameOutputVariable(trainTestOutput.OverallMetrics.VarName, uniqueName, cascadeChanges: true);
sgNode.RenameOutputVariable(trainTestOutput.TrainingOverallMetrics.VarName, uniqueNameTraining, cascadeChanges: true);
trainTestOutput.OverallMetrics.VarName = uniqueName;
trainTestOutput.TrainingOverallMetrics.VarName = uniqueNameTraining;
expNodes.Add(sgNode);

// Store indicators, to pass to next iteration of macro.
src/Microsoft.ML.PipelineInference/Microsoft.ML.PipelineInference.csproj
@@ -17,6 +17,7 @@
<ProjectReference Include="..\Microsoft.ML.Core\Microsoft.ML.Core.csproj" />
<ProjectReference Include="..\Microsoft.ML.StandardLearners\Microsoft.ML.StandardLearners.csproj" />
<ProjectReference Include="..\Microsoft.ML.Sweeper\Microsoft.ML.Sweeper.csproj" />
<ProjectReference Include="..\Microsoft.ML\Microsoft.ML.csproj" />
</ItemGroup>

</Project>
81 changes: 67 additions & 14 deletions src/Microsoft.ML.PipelineInference/PipelinePattern.cs
@@ -21,16 +21,24 @@ public sealed class PipelineResultRow
{
public string GraphJson { get; }
public double MetricValue { get; }
public double TrainingMetricValue { get; }
Ivanidzo4ka (Contributor, May 14, 2018, on: TrainingMetricValue)

Can you add some comments to this property? What's the difference between MetricValue and TrainingMetricValue? #Closed

george-microsoft (Contributor Author): I've added some comments above the properties, explaining them.

TomFinley (Contributor, May 21, 2018): XML <summary> comments are slightly better, insofar as they actually show up in someone's IDE.

george-microsoft (Contributor Author, May 21, 2018): I'll make that change. #Closed
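
For reference, the XML-doc form being suggested would look roughly like this; the summary wording is illustrative rather than the text that was committed:

/// <summary>
/// Metric value computed on the test data for this candidate pipeline.
/// </summary>
public double MetricValue { get; }

/// <summary>
/// Metric value computed on the training data for this candidate pipeline;
/// in this PR it defaults to -1 when training metrics are not available.
/// </summary>
public double TrainingMetricValue { get; }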

public string PipelineId { get; }
public string FirstInput { get; }
public string PredictorModel { get; }

public PipelineResultRow()
{ }

public PipelineResultRow(string graphJson, double metricValue, string pipelineId)
public PipelineResultRow(string graphJson, double metricValue,
string pipelineId, double trainingMetricValue, string firstInput,
string predictorModel)
{
GraphJson = graphJson;
MetricValue = metricValue;
PipelineId = pipelineId;
TrainingMetricValue = trainingMetricValue;
FirstInput = firstInput;
PredictorModel = predictorModel;
}
}

@@ -111,7 +119,8 @@ public AutoInference.EntryPointGraphDef ToEntryPointGraph(Experiment experiment
public bool Equals(PipelinePattern obj) => obj != null && UniqueId == obj.UniqueId;

// REVIEW: We may want to allow for sweeping with CV in the future, so we will need to add new methods like this, or refactor these in that case.
public Experiment CreateTrainTestExperiment(IDataView trainData, IDataView testData, MacroUtils.TrainerKinds trainerKind, out Models.TrainTestEvaluator.Output resultsOutput)
public Experiment CreateTrainTestExperiment(IDataView trainData, IDataView testData, MacroUtils.TrainerKinds trainerKind,
bool includeTrainingMetrics, out Models.TrainTestEvaluator.Output resultsOutput)
{
var graphDef = ToEntryPointGraph();
var subGraph = graphDef.Graph;
@@ -136,7 +145,8 @@ public Experiment CreateTrainTestExperiment(IDataView trainData, IDataView testD
Model = finalOutput
},
PipelineId = UniqueId.ToString("N"),
Kind = MacroUtils.TrainerKindApiValue<Models.MacroUtilsTrainerKinds>(trainerKind)
Kind = MacroUtils.TrainerKindApiValue<Models.MacroUtilsTrainerKinds>(trainerKind),
IncludeTrainingMetrics = includeTrainingMetrics
};

var experiment = _env.CreateExperiment();
@@ -150,7 +160,7 @@ public Experiment CreateTrainTestExperiment(IDataView trainData, IDataView testD
}

public Models.TrainTestEvaluator.Output AddAsTrainTest(Var<IDataView> trainData, Var<IDataView> testData,
MacroUtils.TrainerKinds trainerKind, Experiment experiment = null)
MacroUtils.TrainerKinds trainerKind, Experiment experiment = null, bool includeTrainingMetrics = false)
{
experiment = experiment ?? _env.CreateExperiment();
var graphDef = ToEntryPointGraph(experiment);
@@ -174,7 +184,8 @@ public Models.TrainTestEvaluator.Output AddAsTrainTest(Var<IDataView> trainData,
TrainingData = trainData,
TestingData = testData,
Kind = MacroUtils.TrainerKindApiValue<Models.MacroUtilsTrainerKinds>(trainerKind),
PipelineId = UniqueId.ToString("N")
PipelineId = UniqueId.ToString("N"),
IncludeTrainingMetrics = includeTrainingMetrics
};
var trainTestOutput = experiment.Add(trainTestInput);
return trainTestOutput;
@@ -183,34 +194,58 @@ public Models.TrainTestEvaluator.Output AddAsTrainTest(Var<IDataView> trainData,
/// <summary>
/// Runs a train-test experiment on the current pipeline, through entrypoints.
/// </summary>
public double RunTrainTestExperiment(IDataView trainData, IDataView testData, AutoInference.SupportedMetric metric, MacroUtils.TrainerKinds trainerKind)
public void RunTrainTestExperiment(IDataView trainData, IDataView testData,
AutoInference.SupportedMetric metric, MacroUtils.TrainerKinds trainerKind, out double testMetricValue,
out double trainMetricValue)
{
var experiment = CreateTrainTestExperiment(trainData, testData, trainerKind, out var trainTestOutput);
var experiment = CreateTrainTestExperiment(trainData, testData, trainerKind, true, out var trainTestOutput);
experiment.Run();

var dataOut = experiment.GetOutput(trainTestOutput.OverallMetrics);
var schema = dataOut.Schema;
schema.TryGetColumnIndex(metric.Name, out var metricCol);
double metricValue = 0;
double trainingMetricValue = 0;

using (var cursor = dataOut.GetRowCursor(col => col == metricCol))
{
var getter = cursor.GetGetter<double>(metricCol);
double metricValue = 0;
cursor.MoveNext();
getter(ref metricValue);
return metricValue;
}
Ivanidzo4ka (Contributor, May 21, 2018): You use this code pattern twice here, and once in AutoMlUtils; maybe it makes sense to refactor it into a method? #Closed

Contributor Author: Good suggestion. I'll make that change.
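
A sketch of what that refactoring could look like: a hypothetical shared helper (the name ReadSingleMetric is illustrative, not from this PR) plus the collapsed call sites, assuming the host environment's Check/ExceptParam helpers:

private static double ReadSingleMetric(IHostEnvironment env, IDataView metrics, string metricName)
{
    // Shared replacement for the repeated cursor/getter pattern used here and in AutoMlUtils.
    if (!metrics.Schema.TryGetColumnIndex(metricName, out var col))
        throw env.ExceptParam(nameof(metricName), $"Column '{metricName}' not found.");

    double value = 0;
    using (var cursor = metrics.GetRowCursor(c => c == col))
    {
        var getter = cursor.GetGetter<double>(col);
        env.Check(cursor.MoveNext(), "Expected one row of metrics.");
        getter(ref value);
    }
    return value;
}

// The two reads in this method would then reduce to:
testMetricValue = ReadSingleMetric(_env, experiment.GetOutput(trainTestOutput.OverallMetrics), metric.Name);
trainMetricValue = ReadSingleMetric(_env, experiment.GetOutput(trainTestOutput.TrainingOverallMetrics), metric.Name);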


dataOut = experiment.GetOutput(trainTestOutput.TrainingOverallMetrics);
schema = dataOut.Schema;
schema.TryGetColumnIndex(metric.Name, out metricCol);

using (var cursor = dataOut.GetRowCursor(col => col == metricCol))
{
var getter = cursor.GetGetter<double>(metricCol);
cursor.MoveNext();
Contributor (review comment on: cursor.MoveNext();)

We should validate that this works. We should also validate that the next call to MoveNext will return false.

Contributor Author: Made changes, will push new version.

getter(ref trainingMetricValue);
testMetricValue = metricValue;
Ivanidzo4ka (Contributor, May 21, 2018, on: testMetricValue = metricValue;)

Why is this part of this code block, rather than the one where the metric value is set? #Closed

Contributor Author: Training metrics are optional, so they will not always be present.

trainMetricValue = trainingMetricValue;
}
}
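
A minimal sketch of the MoveNext validation requested above, assuming the class's _env host environment exposes the usual Check helper; the messages are illustrative:

using (var cursor = dataOut.GetRowCursor(col => col == metricCol))
{
    var getter = cursor.GetGetter<double>(metricCol);
    // Validate that the metrics view contains exactly one row.
    _env.Check(cursor.MoveNext(), "Expected a row of overall metrics.");
    getter(ref trainingMetricValue);
    _env.Check(!cursor.MoveNext(), "Expected exactly one row of overall metrics.");
}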

public static PipelineResultRow[] ExtractResults(IHostEnvironment env, IDataView data, string graphColName, string metricColName, string idColName)
public static PipelineResultRow[] ExtractResults(IHostEnvironment env, IDataView data,
string graphColName, string metricColName, string idColName, string trainingMetricColName,
string firstInputColName, string predictorModelColName)
{
var results = new List<PipelineResultRow>();
var schema = data.Schema;
if (!schema.TryGetColumnIndex(graphColName, out var graphCol))
throw env.ExceptNotSupp($"Column name {graphColName} not found");
TomFinley (Contributor, May 14, 2018, on: ExceptNotSupp)

I'm not sure "Not supported" is an appropriate exception here. ExceptParam, on either data or perhaps more usefully on graphColName, would be appropriate. #Closed

Contributor Author: I will make these changes. Thanks.
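
A sketch of the suggested change, using the ExceptParam helper named in the comment (message text unchanged from the diff):

// Report a bad argument rather than an unsupported operation when the column is missing.
if (!schema.TryGetColumnIndex(graphColName, out var graphCol))
    throw env.ExceptParam(nameof(graphColName), $"Column name {graphColName} not found");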

if (!schema.TryGetColumnIndex(metricColName, out var metricCol))
throw env.ExceptNotSupp($"Column name {metricColName} not found");
if (!schema.TryGetColumnIndex(trainingMetricColName, out var trainingMetricCol))
throw env.ExceptNotSupp($"Column name {trainingMetricColName} not found");
if (!schema.TryGetColumnIndex(idColName, out var pipelineIdCol))
throw env.ExceptNotSupp($"Column name {idColName} not found");
if (!schema.TryGetColumnIndex(firstInputColName, out var firstInputCol))
throw env.ExceptNotSupp($"Column name {firstInputColName} not found");
if (!schema.TryGetColumnIndex(predictorModelColName, out var predictorModelCol))
throw env.ExceptNotSupp($"Column name {predictorModelColName} not found");

using (var cursor = data.GetRowCursor(col => true))
{
@@ -225,15 +260,33 @@ public static PipelineResultRow[] ExtractResults(IHostEnvironment env, IDataView
var getter3 = cursor.GetGetter<DvText>(pipelineIdCol);
DvText pipelineId = new DvText();
getter3(ref pipelineId);
results.Add(new PipelineResultRow(graphJson.ToString(), metricValue, pipelineId.ToString()));
var getter4 = cursor.GetGetter<double>(trainingMetricCol);
double trainingMetricValue = 0;
getter4(ref trainingMetricValue);
var getter5 = cursor.GetGetter<DvText>(firstInputCol);
DvText firstInput = new DvText();
getter5(ref firstInput);
var getter6 = cursor.GetGetter<DvText>(predictorModelCol);
DvText predictorModel = new DvText();
getter6(ref predictorModel);

results.Add(new PipelineResultRow(graphJson.ToString(),
metricValue, pipelineId.ToString(), trainingMetricValue,
firstInput.ToString(), predictorModel.ToString()));
}
}

return results.ToArray();
}

public PipelineResultRow ToResultRow() =>
new PipelineResultRow(ToEntryPointGraph().Graph.ToJsonString(),
PerformanceSummary?.MetricValue ?? -1d, UniqueId.ToString("N"));
public PipelineResultRow ToResultRow() {
var graphDef = ToEntryPointGraph();

return new PipelineResultRow($"{{'Nodes' : [{graphDef.Graph.ToJsonString()}]}}",
PerformanceSummary?.MetricValue ?? -1d, UniqueId.ToString("N"),
PerformanceSummary?.TrainingMetricValue ?? -1d,
graphDef.GetSubgraphFirstNodeDataVarName(_env),
graphDef.ModelOutput.VarName);
}
}
}