Skip to content

Commit

Permalink
feat: add several parameters related to dart boosting type (#1045)
Browse files Browse the repository at this point in the history
  • Loading branch information
imatiach-msft authored May 10, 2021
1 parent b7f29e8 commit bd63cc8
Show file tree
Hide file tree
Showing 7 changed files with 78 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,10 @@ trait LightGBMBase[TrainedModel <: Model[TrainedModel]] extends Estimator[Traine
}
}

/** Bundles the user-configured dart-mode settings into a single
  * immutable container passed through to the native LightGBM trainer.
  *
  * @return a [[DartModeParams]] holding drop rate, max drop, skip drop,
  *         xgboost dart mode and uniform drop.
  */
protected def getDartParams(): DartModeParams = {
  DartModeParams(
    dropRate = getDropRate,
    maxDrop = getMaxDrop,
    skipDrop = getSkipDrop,
    xgboostDartMode = getXGBoostDartMode,
    uniformDrop = getUniformDrop)
}

/**
* Inner train method for LightGBM learners. Calculates the number of workers,
* creates a driver thread, and runs mapPartitions on the dataset.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class LightGBMClassifier(override val uid: String)
getIsUnbalance, getVerbosity, categoricalIndexes, actualNumClasses, getBoostFromAverage,
getBoostingType, getLambdaL1, getLambdaL2, getIsProvideTrainingMetric,
getMetric, getMinGainToSplit, getMaxDeltaStep, getMaxBinByFeature, getMinDataInLeaf, getSlotNames,
getDelegate, getChunkSize)
getDelegate, getChunkSize, getDartParams())
}

def getModel(trainParams: TrainParams, lightGBMBooster: LightGBMBooster): LightGBMClassificationModel = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,45 @@ trait LightGBMBinParams extends Wrappable {
def setBinSampleCount(value: Int): this.type = set(binSampleCount, value)
}

/** Defines parameters for dart mode across all LightGBM learners.
  *
  * These correspond to LightGBM's native dart configuration keys
  * (drop_rate, max_drop, skip_drop, xgboost_dart_mode, uniform_drop);
  * they are only forwarded to LightGBM when the boosting type is "dart"
  * (see the boostingType check in TrainParams.toString).
  */
trait LightGBMDartParams extends Wrappable {
  // Fraction of previous trees dropped in each dropout round (default 0.1).
  val dropRate = new DoubleParam(this, "dropRate",
    "Dropout rate: a fraction of previous trees to drop during the dropout")
  setDefault(dropRate -> 0.1)

  def getDropRate: Double = $(dropRate)
  def setDropRate(value: Double): this.type = set(dropRate, value)

  // Upper bound on the number of trees dropped per boosting iteration (default 50).
  val maxDrop = new IntParam(this, "maxDrop",
    "Max number of dropped trees during one boosting iteration")
  setDefault(maxDrop -> 50)

  def getMaxDrop: Int = $(maxDrop)
  def setMaxDrop(value: Int): this.type = set(maxDrop, value)

  // Probability of skipping dropout entirely for an iteration (default 0.5).
  val skipDrop = new DoubleParam(this, "skipDrop",
    "Probability of skipping the dropout procedure during a boosting iteration")
  setDefault(skipDrop -> 0.5)

  def getSkipDrop: Double = $(skipDrop)
  def setSkipDrop(value: Double): this.type = set(skipDrop, value)

  // When true, uses xgboost-compatible dart behavior (default false).
  val xgboostDartMode = new BooleanParam(this, "xgboostDartMode",
    "Set this to true to use xgboost dart mode")
  setDefault(xgboostDartMode -> false)

  def getXGBoostDartMode: Boolean = $(xgboostDartMode)
  def setXGBoostDartMode(value: Boolean): this.type = set(xgboostDartMode, value)

  // When true, trees are dropped uniformly at random (default false).
  val uniformDrop = new BooleanParam(this, "uniformDrop",
    "Set this to true to use uniform drop in dart mode")
  setDefault(uniformDrop -> false)

  def getUniformDrop: Boolean = $(uniformDrop)
  def setUniformDrop(value: Boolean): this.type = set(uniformDrop, value)
}

/** Defines parameters for slots across all LightGBM learners.
*/
trait LightGBMSlotParams extends Wrappable {
Expand Down Expand Up @@ -231,7 +270,7 @@ trait LightGBMModelParams extends Wrappable {
trait LightGBMParams extends Wrappable with DefaultParamsWritable with HasWeightCol
with HasValidationIndicatorCol with HasInitScoreCol with LightGBMExecutionParams
with LightGBMSlotParams with LightGBMFractionParams with LightGBMBinParams with LightGBMLearnerParams
with LightGBMPredictionParams {
with LightGBMDartParams with LightGBMPredictionParams {
val numIterations = new IntParam(this, "numIterations",
"Number of iterations, LightGBM constructs num_class * num_iterations trees")
setDefault(numIterations->100)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class LightGBMRanker(override val uid: String)
getFeatureFraction, getMaxDepth, getMinSumHessianInLeaf, numTasks, modelStr,
getVerbosity, categoricalIndexes, getBoostingType, getLambdaL1, getLambdaL2, getMaxPosition, getLabelGain,
getIsProvideTrainingMetric, getMetric, getEvalAt, getMinGainToSplit, getMaxDeltaStep,
getMaxBinByFeature, getMinDataInLeaf, getSlotNames, getDelegate, getChunkSize)
getMaxBinByFeature, getMinDataInLeaf, getSlotNames, getDelegate, getChunkSize, getDartParams())
}

def getModel(trainParams: TrainParams, lightGBMBooster: LightGBMBooster): LightGBMRankerModel = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class LightGBMRegressor(override val uid: String)
getEarlyStoppingRound, getImprovementTolerance, getFeatureFraction, getMaxDepth, getMinSumHessianInLeaf,
numTasks, modelStr, getVerbosity, categoricalIndexes, getBoostFromAverage, getBoostingType, getLambdaL1,
getLambdaL2, getIsProvideTrainingMetric, getMetric, getMinGainToSplit, getMaxDeltaStep,
getMaxBinByFeature, getMinDataInLeaf, getSlotNames, getDelegate, getChunkSize)
getMaxBinByFeature, getMinDataInLeaf, getSlotNames, getDelegate, getChunkSize, getDartParams())
}

def getModel(trainParams: TrainParams, lightGBMBooster: LightGBMBooster): LightGBMRegressionModel = {
Expand Down
22 changes: 17 additions & 5 deletions src/main/scala/com/microsoft/ml/spark/lightgbm/TrainParams.scala
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ abstract class TrainParams extends Serializable {
def featureNames: Array[String]
def delegate: Option[LightGBMDelegate]
def chunkSize: Int
def dartModeParams: DartModeParams

override def toString: String = {
// Since passing `isProvideTrainingMetric` to LightGBM as a config parameter won't work,
Expand All @@ -51,10 +52,11 @@ abstract class TrainParams extends Serializable {
s"bagging_seed=$baggingSeed early_stopping_round=$earlyStoppingRound " +
s"feature_fraction=$featureFraction max_depth=$maxDepth min_sum_hessian_in_leaf=$minSumHessianInLeaf " +
s"num_machines=$numMachines objective=$objective verbosity=$verbosity " +
s"lambda_l1=$lambdaL1 lambda_l2=$lambdaL2 metric=$metric min_gain_to_split=$minGainToSplit " +
s"lambda_l1=$lambdaL1 lambda_l2=$lambdaL2 metric=$metric min_gain_to_split=$minGainToSplit " +
s"max_delta_step=$maxDeltaStep min_data_in_leaf=$minDataInLeaf " +
(if (categoricalFeatures.isEmpty) "" else s"categorical_feature=${categoricalFeatures.mkString(",")} ") +
(if (maxBinByFeature.isEmpty) "" else s"max_bin_by_feature=${maxBinByFeature.mkString(",")}")
(if (maxBinByFeature.isEmpty) "" else s"max_bin_by_feature=${maxBinByFeature.mkString(",")} ") +
(if (boostingType == "dart") s"${dartModeParams.toString()}" else "")
}
}

Expand All @@ -73,7 +75,7 @@ case class ClassifierTrainParams(parallelism: String, topK: Int, numIterations:
isProvideTrainingMetric: Boolean, metric: String, minGainToSplit: Double,
maxDeltaStep: Double, maxBinByFeature: Array[Int], minDataInLeaf: Int,
featureNames: Array[String], delegate: Option[LightGBMDelegate],
chunkSize: Int)
chunkSize: Int, dartModeParams: DartModeParams)
extends TrainParams {
override def toString(): String = {
val extraStr =
Expand All @@ -98,7 +100,7 @@ case class RegressorTrainParams(parallelism: String, topK: Int, numIterations: I
isProvideTrainingMetric: Boolean, metric: String, minGainToSplit: Double,
maxDeltaStep: Double, maxBinByFeature: Array[Int], minDataInLeaf: Int,
featureNames: Array[String], delegate: Option[LightGBMDelegate],
chunkSize: Int)
chunkSize: Int, dartModeParams: DartModeParams)
extends TrainParams {
override def toString(): String = {
s"alpha=$alpha tweedie_variance_power=$tweedieVariancePower boost_from_average=${boostFromAverage.toString} " +
Expand All @@ -120,7 +122,7 @@ case class RankerTrainParams(parallelism: String, topK: Int, numIterations: Int,
metric: String, evalAt: Array[Int], minGainToSplit: Double,
maxDeltaStep: Double, maxBinByFeature: Array[Int], minDataInLeaf: Int,
featureNames: Array[String], delegate: Option[LightGBMDelegate],
chunkSize: Int)
chunkSize: Int, dartModeParams: DartModeParams)
extends TrainParams {
override def toString(): String = {
val labelGainStr =
Expand All @@ -130,3 +132,13 @@ case class RankerTrainParams(parallelism: String, topK: Int, numIterations: Int,
s"max_position=$maxPosition $labelGainStr $evalAtStr ${super.toString()}"
}
}

/** Defines the dart mode parameters passed to the LightGBM learners.
  *
  * [[toString]] renders the fields as space-separated `key=value` pairs
  * using LightGBM's native configuration names, with a trailing space so
  * the result can be concatenated directly into a larger parameter string.
  */
case class DartModeParams(dropRate: Double, maxDrop: Int, skipDrop: Double,
                          xgboostDartMode: Boolean, uniformDrop: Boolean) extends Serializable {
  override def toString(): String = {
    val pairs = Seq(
      "drop_rate" -> dropRate,
      "max_drop" -> maxDrop,
      "skip_drop" -> skipDrop,
      "xgboost_dart_mode" -> xgboostDartMode,
      "uniform_drop" -> uniformDrop)
    // Trailing " " in mkString keeps the output identical to the
    // concatenation-based format used by the other TrainParams strings.
    pairs.map { case (key, value) => s"$key=$value" }.mkString("", " ", " ")
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,20 @@ class VerifyLightGBMClassifier extends Benchmarks with EstimatorFuzzing[LightGBM
assertBinaryImprovement(scoredDF1, scoredDF2)
}

test("Verify LightGBM Classifier with dart mode parameters") {
  // Verify dart boosting runs end-to-end, and that tuned dart settings
  // measurably improve on the defaults for the same train/test split.
  val Array(train, test) = pimaDF.randomSplit(Array(0.8, 0.2), seed)
  val defaultDartScores = baseModel.setBoostingType("dart").fit(train).transform(test)
  val tunedModel = baseModel.setBoostingType("dart")
    .setXGBoostDartMode(true)
    .setDropRate(0.6)
    .setMaxDrop(60)
    .setSkipDrop(0.6)
    .setUniformDrop(true)
  val tunedDartScores = tunedModel.fit(train).transform(test)
  assertBinaryImprovement(defaultDartScores, tunedDartScores)
}

test("Verify LightGBM Classifier with num tasks parameter") {
val numTasks = Array(0, 1, 2)
numTasks.foreach(nTasks => assertFitWithoutErrors(baseModel.setNumTasks(nTasks), pimaDF))
Expand Down

0 comments on commit bd63cc8

Please sign in to comment.