Skip to content

Commit

Permalink
Polish char- and word-level tokenizers & stopword removers (#2916)
Browse files Browse the repository at this point in the history
* Polish char-level tokenizers

* Polish word-level tokenizers

* Scrub stopword removers
  • Loading branch information
wschin authored Mar 13, 2019
1 parent fa9268d commit 91a8703
Show file tree
Hide file tree
Showing 19 changed files with 58 additions and 87 deletions.
4 changes: 2 additions & 2 deletions docs/code/MlNetCookBook.md
Original file line number Diff line number Diff line change
Expand Up @@ -775,12 +775,12 @@ var pipeline =
ngramLength: 2, useAllLengths: false))

// NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting.
.Append(mlContext.Transforms.Text.TokenizeCharacters("MessageChars", "Message"))
.Append(mlContext.Transforms.Text.ProduceCharactersAsKeys("MessageChars", "Message"))
.Append(new NgramExtractingEstimator(mlContext, "BagOfTrichar", "MessageChars",
ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf))

// NLP pipeline 4: word embeddings.
.Append(mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage"))
.Append(mlContext.Transforms.Text.ProduceWordTokens("TokenizedMessage", "NormalizedMessage"))
.Append(mlContext.Transforms.Text.ExtractWordEmbeddings("Embeddings", "TokenizedMessage",
WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@ public static void Example()
// making use of default settings.
string defaultColumnName = "DefaultKeys";
// REVIEW create through the catalog extension
var default_pipeline = ml.Transforms.Text.TokenizeWords("Review")
var default_pipeline = ml.Transforms.Text.TokenizeIntoWords("Review")
.Append(ml.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review"));

// Another pipeline, that customizes the advanced settings of the ValueToKeyMappingEstimator.
// We can change the maximumNumberOfKeys to limit how many keys will get generated out of the set of words,
// and condition the order in which they get evaluated by changing keyOrdinality from the default ByOccurrence (order in which they get encountered)
// to value/alphabetically.
string customizedColumnName = "CustomizedKeys";
var customized_pipeline = ml.Transforms.Text.TokenizeWords("Review")
var customized_pipeline = ml.Transforms.Text.TokenizeIntoWords("Review")
.Append(ml.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maximumNumberOfKeys: 10, keyOrdinality: ValueToKeyMappingEstimator.KeyOrdinality.ByValue));

// The transformed data.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public static void NgramTransform()
// A pipeline to tokenize text as characters and then combine them together into ngrams
// The pipeline uses the default settings to featurize.

var charsPipeline = ml.Transforms.Text.TokenizeCharacters("Chars", "SentimentText", useMarkerCharacters: false);
var charsPipeline = ml.Transforms.Text.TokenizeIntoCharactersAsKeys("Chars", "SentimentText", useMarkerCharacters: false);
var ngramOnePipeline = ml.Transforms.Text.ProduceNgrams("CharsUnigrams", "Chars", ngramLength: 1);
var ngramTwpPipeline = ml.Transforms.Text.ProduceNgrams("CharsTwograms", "Chars");
var oneCharsPipeline = charsPipeline.Append(ngramOnePipeline);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ public static void Example()

// Let's take SentimentText column and break it into vector of words.
string originalTextColumnName = "Words";
var words = ml.Transforms.Text.TokenizeWords("SentimentText", originalTextColumnName);
var words = ml.Transforms.Text.TokenizeIntoWords("SentimentText", originalTextColumnName);

// Default pipeline will apply default stop word remover which is based on a predefined set of words for certain languages.
var defaultPipeline = words.Append(ml.Transforms.Text.RemoveDefaultStopWords(originalTextColumnName, "DefaultRemover"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ public static void Example()
j.Features = features;
};

var engine = mlContext.Transforms.Text.TokenizeWords("TokenizedWords", "Sentiment_Text")
var engine = mlContext.Transforms.Text.TokenizeIntoWords("TokenizedWords", "Sentiment_Text")
.Append(mlContext.Transforms.Conversion.MapValue(lookupMap, "Words", "Ids", new ColumnOptions[] { ("VariableLenghtFeatures", "TokenizedWords") }))
.Append(mlContext.Transforms.CustomMapping(ResizeFeaturesAction, "Resize"))
.Append(tensorFlowModel.ScoreTensorFlowModel(new[] { "Prediction/Softmax" }, new[] { "Features" }))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public static void Example()

// Pipeline which goes through SentimentText and normalizes it, tokenizes it by words, and removes default stopwords.
var wordsPipeline = ml.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
.Append(ml.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
.Append(ml.Transforms.Text.TokenizeIntoWords("Words", "NormalizedText"))
.Append(ml.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));

var wordsDataview = wordsPipeline.Fit(trainData).Transform(trainData);
Expand Down
8 changes: 4 additions & 4 deletions src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="separators">The separators to use (uses space character by default).</param>
public static VarVector<string> TokenizeText(this Scalar<string> input, char[] separators = null) => new OutPipelineColumn(input, separators);
public static VarVector<string> TokenizeIntoWords(this Scalar<string> input, char[] separators = null) => new OutPipelineColumn(input, separators);
}

/// <summary>
Expand Down Expand Up @@ -109,7 +109,7 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
public static VarVector<Key<ushort, string>> TokenizeIntoCharacters(this Scalar<string> input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters);
public static VarVector<Key<ushort, string>> TokenizeIntoCharactersAsKeys(this Scalar<string> input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters);
}

/// <summary>
Expand Down Expand Up @@ -162,8 +162,8 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
/// Remove stop words from incoming text.
/// </summary>
/// <param name="input">The column to apply to.</param>
/// <param name="language">Language of the input text.</param>
public static VarVector<string> RemoveStopwords(this VarVector<string> input,
/// <param name="language">Language of the input text. It will be used to retrieve a built-in stopword list.</param>
public static VarVector<string> RemoveDefaultStopWords(this VarVector<string> input,
StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English) => new OutPipelineColumn(input, language);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ private static VersionInfo GetVersionInfo()
/// <summary>
/// Defines the behavior of the transformer.
/// </summary>
public IReadOnlyCollection<StopWordsRemovingEstimator.ColumnOptions> Columns => _columns.AsReadOnly();
internal IReadOnlyCollection<StopWordsRemovingEstimator.ColumnOptions> Columns => _columns.AsReadOnly();

private readonly StopWordsRemovingEstimator.ColumnOptions[] _columns;
private static volatile NormStr.Pool[] _stopWords;
Expand Down Expand Up @@ -828,7 +828,7 @@ private void LoadStopWords(IChannel ch, ReadOnlyMemory<char> stopwords, string d
/// <summary>
/// The names of the input output column pairs on which this transformation is applied.
/// </summary>
public IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly();
internal IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly();

/// <summary>
/// Custom stopword remover removes specified list of stop words.
Expand Down
61 changes: 8 additions & 53 deletions src/Microsoft.ML.Transforms/Text/TextCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,9 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
public static TokenizingByCharactersEstimator TokenizeCharacters(this TransformsCatalog.TextTransforms catalog,
/// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
/// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog,
string outputColumnName,
string inputColumnName = null,
bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters)
Expand All @@ -67,10 +68,11 @@ public static TokenizingByCharactersEstimator TokenizeCharacters(this Transforms
/// Tokenize incoming text in input columns and output the tokens as output columns.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
/// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
/// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
/// <param name="columns">Pairs of columns to run the tokenization on.</param>

public static TokenizingByCharactersEstimator TokenizeCharacters(this TransformsCatalog.TextTransforms catalog,
public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog,
bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters,
params ColumnOptions[] columns)
=> new TokenizingByCharactersEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), useMarkerCharacters, ColumnOptions.ConvertToValueTuples(columns));
Expand Down Expand Up @@ -157,29 +159,18 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="separators">The separators to use (uses space character by default).</param>
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
public static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog.TextTransforms catalog,
string outputColumnName,
string inputColumnName = null,
char[] separators = null)
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, separators);

/// <summary>
/// Tokenizes incoming text in input columns and outputs the tokens using <paramref name="separators"/> as separators.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="columns">Pairs of columns to run the tokenization on.</param>
/// <param name="separators">The separators to use (uses space character by default).</param>
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
(string outputColumnName, string inputColumnName)[] columns,
char[] separators = null)
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, separators);

/// <summary>
/// Tokenizes incoming text in input columns, using per-column configurations, and outputs the tokens.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="columns">Pairs of columns to run the tokenization on.</param>
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
public static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog.TextTransforms catalog,
params WordTokenizingEstimator.ColumnOptions[] columns)
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns);

Expand Down Expand Up @@ -243,24 +234,6 @@ public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsC
StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English)
=> new StopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, language);

/// <summary>
/// Removes stop words from incoming token streams in input columns
/// and outputs the token streams without stop words as output columns.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="columns">Pairs of columns to remove stop words on.</param>
/// <param name="language">Language of the input text columns <paramref name="columns"/>.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[FastTree](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs)]
/// ]]></format>
/// </example>
public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsCatalog.TextTransforms catalog,
(string outputColumnName, string inputColumnName)[] columns,
StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English)
=> new StopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, language);

/// <summary>
/// Removes stop words from incoming token streams in <paramref name="inputColumnName"/>
/// and outputs the token streams without stopwords as <paramref name="outputColumnName"/>.
Expand All @@ -281,24 +254,6 @@ public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCa
params string[] stopwords)
=> new CustomStopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, stopwords);

/// <summary>
/// Removes stop words from incoming token streams in input columns
/// and outputs the token streams without stop words as output columns.
/// </summary>
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="columns">Pairs of columns to remove stop words on.</param>
/// <param name="stopwords">Array of words to remove.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[FastTree](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs)]
/// ]]></format>
/// </example>
public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCatalog.TextTransforms catalog,
(string outputColumnName, string inputColumnName)[] columns,
params string[] stopwords)
=> new CustomStopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, stopwords);

/// <summary>
/// Produces a bag of counts of ngrams (sequences of consecutive words) in <paramref name="inputColumnName"/>
/// and outputs bag of word vector as <paramref name="outputColumnName"/>
Expand Down
Loading

0 comments on commit 91a8703

Please sign in to comment.