Skip to content

Commit

Permalink
Input output swap (#2239)
Browse files Browse the repository at this point in the history
* All the code changes in, and most of the tests updated.

* all the tests pass

* 1 - Changing the "source" parameter name and field in the ColumnInfo classes, to be "sourceColumnName", as suggested.
Changing the "name" parameter to "outputColumnName" in the:
- estimator extension APIs
- estimator ctors
- column pairs expressed through tuples, because in context it reads better than name.

Note: In the ColumnInfo classes I left it as "name" because "outputColumnName" makes no sense.

2 - Nit on standardizing the XML comments.
3 - Arranging the order of the parameters to be: outputColumnName, required parameters, nullable sourceColumnName.
  • Loading branch information
sfilipi authored Jan 29, 2019
1 parent 22ea6d1 commit e383091
Show file tree
Hide file tree
Showing 171 changed files with 2,216 additions and 2,138 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,14 @@ public static void FeatureSelectionTransform()
// In this example we define a CountFeatureSelectingEstimator, that selects slots in a feature vector that have more non-default
// values than the specified count. This transformation can be used to remove slots with too many missing values.
var countSelectEst = ml.Transforms.FeatureSelection.SelectFeaturesBasedOnCount(
inputColumn: "Features", outputColumn: "FeaturesCountSelect", count: 695);
outputColumnName: "FeaturesCountSelect", inputColumnName: "Features", count: 695);

// We also define a MutualInformationFeatureSelectingEstimator that selects the top k slots in a feature
// vector based on highest mutual information between that slot and a specified label. Notice that it is possible to
// specify the parameter `numBins', which controls the number of bins used in the approximation of the mutual information
// between features and label.
var mutualInfoEst = ml.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation(
inputColumn: "FeaturesCountSelect", outputColumn: "FeaturesMISelect", labelColumn: "Label", slotsInOutput: 5);
outputColumnName: "FeaturesMISelect", inputColumnName: "FeaturesCountSelect", labelColumn: "Label", slotsInOutput: 5);

// Now, we can put the previous two transformations together in a pipeline.
var pipeline = countSelectEst.Append(mutualInfoEst);
Expand Down
4 changes: 2 additions & 2 deletions docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,15 @@ public static void KeyToValue_Term()
string defaultColumnName = "DefaultKeys";
// REVIEW create through the catalog extension
var default_pipeline = new WordTokenizingEstimator(ml, "Review")
.Append(new ValueToKeyMappingEstimator(ml, "Review", defaultColumnName));
.Append(new ValueToKeyMappingEstimator(ml, defaultColumnName, "Review"));

// Another pipeline, that customizes the advanced settings of the TermEstimator.
// We can change the maxNumTerm to limit how many keys will get generated out of the set of words,
// and condition the order in which they get evaluated by changing sort from the default Occurence (order in which they get encountered)
// to value/alphabetically.
string customizedColumnName = "CustomizedKeys";
var customized_pipeline = new WordTokenizingEstimator(ml, "Review")
.Append(new ValueToKeyMappingEstimator(ml, "Review", customizedColumnName, maxNumTerms: 10, sort: ValueToKeyMappingTransformer.SortOrder.Value));
.Append(new ValueToKeyMappingEstimator(ml,customizedColumnName, "Review", maxNumTerms: 10, sort: ValueToKeyMappingTransformer.SortOrder.Value));

// The transformed data.
var transformedData_default = default_pipeline.Fit(trainData).Transform(trainData);
Expand Down
6 changes: 3 additions & 3 deletions docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ public static void NgramTransform()
// A pipeline to tokenize text as characters and then combine them together into ngrams
// The pipeline uses the default settings to featurize.

var charsPipeline = ml.Transforms.Text.TokenizeCharacters("SentimentText", "Chars", useMarkerCharacters:false);
var ngramOnePipeline = ml.Transforms.Text.ProduceNgrams("Chars", "CharsUnigrams", ngramLength:1);
var ngramTwpPipeline = ml.Transforms.Text.ProduceNgrams("Chars", "CharsTwograms");
var charsPipeline = ml.Transforms.Text.TokenizeCharacters("Chars", "SentimentText", useMarkerCharacters:false);
var ngramOnePipeline = ml.Transforms.Text.ProduceNgrams("CharsUnigrams", "Chars", ngramLength:1);
var ngramTwpPipeline = ml.Transforms.Text.ProduceNgrams("CharsTwograms", "Chars");
var oneCharsPipeline = charsPipeline.Append(ngramOnePipeline);
var twoCharsPipeline = charsPipeline.Append(ngramTwpPipeline);

Expand Down
6 changes: 3 additions & 3 deletions docs/samples/Microsoft.ML.Samples/Dynamic/Normalizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public static void Normalizer()
var transformer = pipeline.Fit(trainData);

var modelParams = transformer.Columns
.First(x => x.Output == "Induced")
.First(x => x.Name == "Induced")
.ModelParameters as NormalizingTransformer.AffineNormalizerModelParameters<float>;

Console.WriteLine($"The normalization parameters are: Scale = {modelParams.Scale} and Offset = {modelParams.Offset}");
Expand Down Expand Up @@ -66,7 +66,7 @@ public static void Normalizer()

// Composing a different pipeline if we wanted to normalize more than one column at a time.
// Using log scale as the normalization mode.
var multiColPipeline = ml.Transforms.Normalize(NormalizingEstimator.NormalizerMode.LogMeanVariance, new[] { ("Induced", "LogInduced"), ("Spontaneous", "LogSpontaneous") });
var multiColPipeline = ml.Transforms.Normalize(NormalizingEstimator.NormalizerMode.LogMeanVariance, new[] { ("LogInduced", "Induced"), ("LogSpontaneous", "Spontaneous") });
// The transformed data.
var multiColtransformer = multiColPipeline.Fit(trainData);
var multiColtransformedData = multiColtransformer.Transform(trainData);
Expand Down Expand Up @@ -97,7 +97,7 @@ public static void Normalizer()

// Inspect the weights of normalizing the columns
var multiColModelParams = multiColtransformer.Columns
.First(x=> x.Output == "LogInduced")
.First(x=> x.Name == "LogInduced")
.ModelParameters as NormalizingTransformer.CdfNormalizerModelParameters<float>;

Console.WriteLine($"The normalization parameters are: Mean = {multiColModelParams.Mean} and Stddev = {multiColModelParams.Stddev}");
Expand Down
2 changes: 1 addition & 1 deletion docs/samples/Microsoft.ML.Samples/Dynamic/OnnxTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ public static void OnnxTransformSample()
var mlContext = new MLContext();
var data = GetTensorData();
var idv = mlContext.Data.ReadFromEnumerable(data);
var pipeline = new OnnxScoringEstimator(mlContext, modelPath, new[] { inputInfo.Key }, new[] { outputInfo.Key });
var pipeline = new OnnxScoringEstimator(mlContext, new[] { outputInfo.Key }, new[] { inputInfo.Key }, modelPath);

// Run the pipeline and get the transformed values
var transformedValues = pipeline.Fit(idv).Transform(idv);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ public static void TensorFlowScoringSample()
// Create a ML pipeline.
var pipeline = mlContext.Transforms.ScoreTensorFlowModel(
modelLocation,
new[] { nameof(TensorData.input) },
new[] { nameof(OutputScores.output) });
new[] { nameof(OutputScores.output) },
new[] { nameof(TensorData.input) });

// Run the pipeline and get the transformed values.
var estimator = pipeline.Fit(idv);
Expand Down
4 changes: 2 additions & 2 deletions docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ public static void TextTransform()
// A pipeline for featurization of the "SentimentText" column, and placing the output in a new column named "DefaultTextFeatures"
// The pipeline uses the default settings to featurize.
string defaultColumnName = "DefaultTextFeatures";
var default_pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", defaultColumnName);
var default_pipeline = ml.Transforms.Text.FeaturizeText(defaultColumnName , "SentimentText");

// Another pipeline, that customizes the advanced settings of the FeaturizeText transformer.
string customizedColumnName = "CustomizedTextFeatures";
var customized_pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", customizedColumnName, s =>
var customized_pipeline = ml.Transforms.Text.FeaturizeText(customizedColumnName, "SentimentText", s =>
{
s.KeepPunctuations = false;
s.KeepNumbers = false;
Expand Down
18 changes: 9 additions & 9 deletions src/Microsoft.ML.Core/Data/IEstimator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -75,29 +75,29 @@ internal Column(string name, VectorKind vecKind, ColumnType itemType, bool isKey
}

/// <summary>
/// Returns whether <paramref name="inputColumn"/> is a valid input, if this object represents a
/// Returns whether <paramref name="source"/> is a valid input, if this object represents a
/// requirement.
///
/// Namely, it returns true iff:
/// - The <see cref="Name"/>, <see cref="Kind"/>, <see cref="ItemType"/>, <see cref="IsKey"/> fields match.
/// - The columns of <see cref="Metadata"/> of <paramref name="inputColumn"/> is a superset of our <see cref="Metadata"/> columns.
/// - The columns of <see cref="Metadata"/> of <paramref name="source"/> is a superset of our <see cref="Metadata"/> columns.
/// - Each such metadata column is itself compatible with the input metadata column.
/// </summary>
[BestFriend]
internal bool IsCompatibleWith(Column inputColumn)
internal bool IsCompatibleWith(Column source)
{
Contracts.Check(inputColumn.IsValid, nameof(inputColumn));
if (Name != inputColumn.Name)
Contracts.Check(source.IsValid, nameof(source));
if (Name != source.Name)
return false;
if (Kind != inputColumn.Kind)
if (Kind != source.Kind)
return false;
if (!ItemType.Equals(inputColumn.ItemType))
if (!ItemType.Equals(source.ItemType))
return false;
if (IsKey != inputColumn.IsKey)
if (IsKey != source.IsKey)
return false;
foreach (var metaCol in Metadata)
{
if (!inputColumn.Metadata.TryFindColumn(metaCol.Name, out var inputMetaCol))
if (!source.Metadata.TryFindColumn(metaCol.Name, out var inputMetaCol))
return false;
if (!metaCol.IsCompatibleWith(inputMetaCol))
return false;
Expand Down
8 changes: 4 additions & 4 deletions src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -708,11 +708,11 @@ private protected override void PrintFoldResultsCore(IChannel ch, Dictionary<str
var pFormatName = string.Format(FoldDrAtPFormat, _p);
var numAnomName = string.Format(FoldDrAtNumAnomaliesFormat, numAnomalies);

(string Source, string Name)[] cols =
(string name, string source)[] cols =
{
(AnomalyDetectionEvaluator.OverallMetrics.DrAtK, kFormatName),
(AnomalyDetectionEvaluator.OverallMetrics.DrAtPFpr, pFormatName),
(AnomalyDetectionEvaluator.OverallMetrics.DrAtNumPos, numAnomName)
(kFormatName, AnomalyDetectionEvaluator.OverallMetrics.DrAtK),
(pFormatName, AnomalyDetectionEvaluator.OverallMetrics.DrAtPFpr),
(numAnomName, AnomalyDetectionEvaluator.OverallMetrics.DrAtNumPos)
};

// List of columns to keep, note that the order specified determines the order of the output
Expand Down
8 changes: 4 additions & 4 deletions src/Microsoft.ML.Data/Evaluators/BinaryClassifierEvaluator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1198,11 +1198,11 @@ private protected override void PrintFoldResultsCore(IChannel ch, Dictionary<str
if (!metrics.TryGetValue(MetricKinds.ConfusionMatrix, out conf))
throw ch.Except("No overall metrics found");

(string Source, string Name)[] cols =
(string name, string source)[] cols =
{
(BinaryClassifierEvaluator.Accuracy, FoldAccuracy),
(BinaryClassifierEvaluator.LogLoss, FoldLogLoss),
(BinaryClassifierEvaluator.LogLossReduction, FoldLogLosRed)
(FoldAccuracy, BinaryClassifierEvaluator.Accuracy),
(FoldLogLoss, BinaryClassifierEvaluator.LogLoss),
(FoldLogLosRed, BinaryClassifierEvaluator.LogLossReduction)
};

var colsToKeep = new List<string>();
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Evaluators/MamlEvaluator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ private IDataView WrapPerInstance(RoleMappedData perInst)

// Make a list of column names that Maml outputs as part of the per-instance data view, and then wrap
// the per-instance data computed by the evaluator in a SelectColumnsTransform.
var cols = new List<(string Source, string Name)>();
var cols = new List<(string name, string source)>();
var colsToKeep = new List<string>();

// If perInst is the result of cross-validation and contains a fold Id column, include it.
Expand All @@ -241,7 +241,7 @@ private IDataView WrapPerInstance(RoleMappedData perInst)
// Maml always outputs a name column, if it doesn't exist add a GenerateNumberTransform.
if (perInst.Schema.Name?.Name is string nameName)
{
cols.Add((nameName, "Instance"));
cols.Add(("Instance", nameName));
colsToKeep.Add("Instance");
}
else
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -950,7 +950,7 @@ private protected override IDataView GetOverallResultsCore(IDataView overall)

private IDataView ChangeTopKAccColumnName(IDataView input)
{
input = new ColumnCopyingTransformer(Host, (MultiClassClassifierEvaluator.TopKAccuracy, string.Format(TopKAccuracyFormat, _outputTopKAcc))).Transform(input);
input = new ColumnCopyingTransformer(Host, (string.Format(TopKAccuracyFormat, _outputTopKAcc), MultiClassClassifierEvaluator.TopKAccuracy)).Transform(input);
return ColumnSelectingTransformer.CreateDrop(Host, input, MultiClassClassifierEvaluator.TopKAccuracy);
}

Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/TrainCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,9 @@ private void EnsureStratificationColumn(ref IDataView data, ref string stratific
stratificationColumn = string.Format("{0}_{1:000}", origStratCol, ++inc);
HashingTransformer.ColumnInfo columnInfo;
if (seed.HasValue)
columnInfo = new HashingTransformer.ColumnInfo(origStratCol, stratificationColumn, 30, seed.Value);
columnInfo = new HashingTransformer.ColumnInfo(stratificationColumn, origStratCol, 30, seed.Value);
else
columnInfo = new HashingTransformer.ColumnInfo(origStratCol, stratificationColumn, 30);
columnInfo = new HashingTransformer.ColumnInfo(stratificationColumn, origStratCol, 30);
data = new HashingEstimator(Host, columnInfo).Fit(data).Transform(data);
}
}
Expand Down
16 changes: 8 additions & 8 deletions src/Microsoft.ML.Data/Transforms/ColumnConcatenatingEstimator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,20 @@ public sealed class ColumnConcatenatingEstimator : IEstimator<ITransformer>
/// Initializes a new instance of <see cref="ColumnConcatenatingEstimator"/>
/// </summary>
/// <param name="env">The local instance of <see cref="IHostEnvironment"/>.</param>
/// <param name="outputColumn">The name of the resulting column.</param>
/// <param name="inputColumns">The columns to concatenate together.</param>
public ColumnConcatenatingEstimator (IHostEnvironment env, string outputColumn, params string[] inputColumns)
/// <param name="outputColumnName">The name of the resulting column.</param>
/// <param name="inputColumnNames">The columns to concatenate together.</param>
public ColumnConcatenatingEstimator(IHostEnvironment env, string outputColumnName, params string[] inputColumnNames)
{
Contracts.CheckValue(env, nameof(env));
_host = env.Register("ColumnConcatenatingEstimator ");

_host.CheckNonEmpty(outputColumn, nameof(outputColumn));
_host.CheckValue(inputColumns, nameof(inputColumns));
_host.CheckParam(!inputColumns.Any(r => string.IsNullOrEmpty(r)), nameof(inputColumns),
_host.CheckNonEmpty(outputColumnName, nameof(outputColumnName));
_host.CheckValue(inputColumnNames, nameof(inputColumnNames));
_host.CheckParam(!inputColumnNames.Any(r => string.IsNullOrEmpty(r)), nameof(inputColumnNames),
"Contained some null or empty items");

_name = outputColumn;
_source = inputColumns;
_name = outputColumnName;
_source = inputColumnNames;
}

public ITransformer Fit(IDataView input)
Expand Down
Loading

0 comments on commit e383091

Please sign in to comment.