搜索-推荐排序优化(更新中...)

作者:jcmp      发布时间:2021-04-23      浏览量:0
一、搜索&推荐CTR 二、如何选择高质量训练样本 三、模型选择与训练 四、如何进行模型离线测试

一、搜索&推荐CTR

二、如何选择高质量训练样本

三、模型选择与训练

业界常用的CTR预估模型有LR和GBDT

通过spark实现GBDT+LR

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.apache.spark.mllib.classification.{ LogisticRegressionModel, LogisticRegressionWithLBFGS }
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.ml.linalg.{ Vector => mlVector }
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.mllib.tree.configuration.FeatureType._
import org.apache.spark.mllib.tree.model.{ GradientBoostedTreesModel, Node }
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import scala.collection.mutable.ArrayBuffer

/**
 * GBDT+LR CTR pipeline (classic Facebook-style two-stage model):
 *  1. train a GBDT on the raw libSVM-style samples;
 *  2. map every sample to the set of leaf nodes it lands in (one-hot per tree);
 *  3. train a logistic regression on those leaf-index features;
 *  4. report AUC / AUPR on the held-out split.
 */
object GbdtLr {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().
      master("local").
      appName("GbdtLr").
      getOrCreate()
    import spark.implicits._

    // 1 GBDT hyper-parameters.
    val maxDepth = 15
    val numTrees = 10
    val minInstancesPerNode = 2

    // 2 Load samples and split into train/test.
    val dataPath = "hdfs://1.1.1.1:9000/user/data01/"
    val (trainingData, testData) = readLibSvmSampleData(spark, dataPath)
    trainingData.cache()
    testData.cache()
    println(s"trainingData.count(): ${trainingData.count()}")
    println(s"testData.count(): ${testData.count()}")
    println("trainingData.show")
    trainingData.show
    // All samples are re-encoded through the GBDT below; `unionAll` is
    // deprecated since Spark 2.0, use `union` instead.
    val data = trainingData.union(testData)

    // 3 Train the GBDT model ("Regression" boosting strategy, as is
    //   conventional for GBDT+LR leaf encoding).
    val boostingStrategy = BoostingStrategy.defaultParams("Regression")
    boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()
    boostingStrategy.treeStrategy.minInstancesPerNode = minInstancesPerNode
    boostingStrategy.numIterations = numTrees
    boostingStrategy.treeStrategy.maxDepth = maxDepth
    val gbdtModel = GradientBoostedTrees.train(trainingData.rdd, boostingStrategy)

    // 4 Extract every leaf of every tree and assign it a dense feature id.
    val treeLeafMap = getTreeLeafMap(gbdtModel)

    // 5 Re-encode all samples as sparse one-hot vectors over leaf ids.
    val lrSampleLablePoint = lrSample(data.rdd, treeLeafMap, gbdtModel)
    val lrSplits = lrSampleLablePoint.randomSplit(Array(0.7, 0.3))
    val (lrTrainingData, lrTestData) = (lrSplits(0), lrSplits(1))
    lrTrainingData.cache()
    lrTrainingData.count()
    lrTestData.cache()
    lrTestData.count()

    // 6 Train the LR stage on the leaf-encoded samples.
    val lr = new LogisticRegressionWithLBFGS().setNumClasses(2)
    lr.optimizer.setNumIterations(100)
    lr.optimizer.setRegParam(0.0)
    val lrModel = lr.run(lrTrainingData)

    // 7 Evaluate: clear the decision threshold so predict() returns raw
    //   probabilities, as required by BinaryClassificationMetrics.
    lrModel.clearThreshold()
    val scoreAndLabels = lrTestData.map { point =>
      val score = lrModel.predict(point.features)
      (score, point.label)
    }
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val auc = metrics.areaUnderROC()
    val aupr = metrics.areaUnderPR()
    println(s"AUC: ${auc}")
    println(s"AUPR: ${aupr}")
  }

  /**
   * Reads libSVM-style text samples ("label idx:val idx:val ...") and returns
   * a (training, test) split.
   *  1) read the raw lines;
   *  2) build a global feature-name -> column-index map;
   *  3) parse each line into a sparse LabeledPoint;
   *  4) randomly split 60/40.
   *
   * @param spark    active SparkSession.
   * @param dataPath input path (HDFS or local).
   * @return (trainingData, testData) as Datasets of LabeledPoint.
   */
  def readLibSvmSampleData(
    @transient spark: org.apache.spark.sql.SparkSession,
    dataPath: String): (Dataset[LabeledPoint], Dataset[LabeledPoint]) = {
    import spark.implicits._

    // 2.1 Read raw lines. '|' delimiter keeps each whole line in one column.
    val dataRead = spark.read.options(Map(("delimiter", "|"), ("header", "false"))).csv(dataPath)

    // 2.2 Collect every distinct feature name and index it deterministically
    //     (sorted before zipWithIndex so the mapping is stable across runs).
    val featureMap = dataRead.map { case Row(libSvmFeature: String) =>
      val items = libSvmFeature.split(' ')
      items.filter(_.nonEmpty).
        filter(f => f.split(':').size == 2).
        map(item => item.split(':')(0))
    }.flatMap(x => x).distinct().collect().sorted.zipWithIndex.toMap
    val numFeatures = featureMap.size

    // 2.3 Parse each line into a sparse LabeledPoint.
    val readSampleData = dataRead.map { case Row(libSvmFeature: String) =>
      val items = libSvmFeature.split(' ')
      val click = items(0).toDouble
      val features = items.filter(_.nonEmpty).
        filter(f => f.split(':').size == 2).
        map { item =>
          val indexAndValue = item.split(':')
          val id = featureMap.getOrElse(indexAndValue(0), -1)
          val value = indexAndValue(1).toDouble
          (id, value)
        }.
        // BUG FIX: was `f._1 > 0`, which silently dropped the feature at
        // index 0; only the -1 "unknown feature" sentinel must be removed.
        filter(f => f._1 >= 0).
        // Vectors.sparse requires strictly increasing indices.
        sortBy(f => f._1)
      val label = if (click > 0) 1.0 else 0.0
      LabeledPoint(label, Vectors.sparse(numFeatures, features.map(_._1), features.map(_._2)))
    }

    // 2.4 Random 60/40 train/test split.
    val splits = readSampleData.randomSplit(Array(0.6, 0.4))
    (splits(0), splits(1))
  }

  /**
   * Re-encodes samples through a trained GBDT: each sample is routed down
   * every tree, and the id of the leaf it reaches becomes a one-hot feature.
   *
   * @param sampleLablePoint original samples, RDD[LabeledPoint].
   * @param lrFeatureMap     mapping "treeIndex_leafNodeId" -> dense feature id
   *                         (from [[getTreeLeafMap]]).
   * @param gbdtModel        trained GBDT model.
   * @return leaf-encoded samples as RDD[LabeledPoint] (one 1.0 per tree).
   */
  def lrSample(
    sampleLablePoint: RDD[LabeledPoint],
    lrFeatureMap: Map[String, Int],
    gbdtModel: GradientBoostedTreesModel): RDD[LabeledPoint] = {
    val treeNumber = gbdtModel.trees.length
    val lrFeatureNum = lrFeatureMap.size
    val lrSampleParsed = sampleLablePoint.map { point =>
      val label = point.label
      val features = point.features
      val lrFeatures = ArrayBuffer[Int]()
      val lrValues = ArrayBuffer[Double]()
      for (treeIndex <- 0 until treeNumber) {
        // Walk the tree from the root until a leaf, following the same
        // split semantics MLlib uses at prediction time.
        var node = gbdtModel.trees(treeIndex).topNode
        while (!node.isLeaf) {
          val split = node.split.get
          node =
            if (split.featureType == Continuous) {
              if (features(split.feature) <= split.threshold) node.leftNode.get
              else node.rightNode.get
            } else {
              if (split.categories.contains(features(split.feature))) node.leftNode.get
              else node.rightNode.get
            }
        }
        val key = treeIndex.toString + '_' + node.id
        lrFeatures += lrFeatureMap(key)
        lrValues += 1.0
      }
      // Indices must be sorted for Vectors.sparse.
      (label, lrFeatures.sorted.toArray, lrValues.toArray)
    }
    lrSampleParsed.map { case (label, lrFeatures, lrValues) =>
      LabeledPoint(label, Vectors.sparse(lrFeatureNum, lrFeatures, lrValues))
    }
  }

  /**
   * Collects every leaf node of every tree in the GBDT model via a
   * breadth-first traversal and numbers them consecutively.
   *
   * @param gbdtModel trained GBDT model.
   * @return Map[String, Int] keyed by "treeIndex_leafNodeId" -> dense id.
   */
  def getTreeLeafMap(gbdtModel: GradientBoostedTreesModel): Map[String, Int] = {
    val lrFeatureMap = scala.collection.mutable.Map[String, Int]()
    var featureId = 0
    val treeNumber = gbdtModel.trees.size
    for (treeIndex <- 0 until treeNumber) {
      val treeNodeQueue = collection.mutable.Queue[Node]()
      treeNodeQueue.enqueue(gbdtModel.trees(treeIndex).topNode)
      while (!treeNodeQueue.isEmpty) {
        val resNode = treeNodeQueue.dequeue()
        if (resNode.isLeaf) {
          val key = treeIndex.toString + '_' + resNode.id.toString
          lrFeatureMap(key) = featureId
          featureId = featureId + 1
        }
        if (resNode.leftNode.isDefined) treeNodeQueue.enqueue(resNode.leftNode.get)
        if (resNode.rightNode.isDefined) treeNodeQueue.enqueue(resNode.rightNode.get)
      }
    }
    lrFeatureMap.toMap
  }
}

四、如何进行模型离线测试

常用的划分方式有两种,随机划分和按时间划分,其中按时间划分,可将最近两天数据做为测试集,其余数据作为训练集。