mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-12-22 09:55:32 +00:00
Open-sourcing Timelines Aggregation Framework
Open sourcing Aggregation Framework, a config-driven Summingbird based framework for generating real-time and batch aggregate features to be consumed by ML models.
This commit is contained in:
parent
b5e849b029
commit
197bf2c563
|
@ -20,6 +20,7 @@ Product surfaces at Twitter are built on a shared set of data, models, and softw
|
|||
| | [topic-social-proof](topic-social-proof/README.md) | Identifies topics related to individual Tweets. |
|
||||
| Software framework | [navi](navi/README.md) | High performance, machine learning model serving written in Rust. |
|
||||
| | [product-mixer](product-mixer/README.md) | Software framework for building feeds of content. |
|
||||
| | [timelines-aggregation-framework](timelines/data_processing/ml_util/aggregation_framework/README.md) | Framework for generating aggregate features in batch or real time. |
|
||||
| | [twml](twml/README.md) | Legacy machine learning framework built on TensorFlow v1. |
|
||||
|
||||
The product surface currently included in this repository is the For You Timeline.
|
||||
|
|
|
@ -0,0 +1,68 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates
|
||||
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.ml.api.FeatureContext
|
||||
import com.twitter.ml.api.ITransform
|
||||
import com.twitter.ml.api.constant.SharedFeatures
|
||||
import java.lang.{Double => JDouble}
|
||||
|
||||
import com.twitter.timelines.prediction.common.adapters.AdapterConsumer
|
||||
import com.twitter.timelines.prediction.common.adapters.EngagementLabelFeaturesDataRecordUtils
|
||||
import com.twitter.ml.api.DataRecord
|
||||
import com.twitter.ml.api.RichDataRecord
|
||||
import com.twitter.timelines.suggests.common.engagement.thriftscala.EngagementType
|
||||
import com.twitter.timelines.suggests.common.engagement.thriftscala.Engagement
|
||||
import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures
|
||||
import com.twitter.timelines.prediction.features.common.CombinedFeatures
|
||||
|
||||
/**
 * Transforms BCE-event UUA data records, which carry only a continuous dwell time, into
 * data records that also carry the corresponding binary label features.
 *
 * The incoming UUA data records are expected to have USER_ID, SOURCE_TWEET_ID, TIMESTAMP and
 * zero or one of the TWEET_DETAIL_DWELL_TIME_MS, PROFILE_DWELL_TIME_MS and
 * FULLSCREEN_VIDEO_DWELL_TIME_MS features. Whichever dwell-time feature is present decides the
 * engagement type, and the existing label logic (EngagementTypeConverter, reached through
 * EngagementLabelFeaturesDataRecordUtils) is re-used to add the binary labels to the record.
 */
object BCELabelTransformFromUUADataRecord extends ITransform {

  /** Each supported dwell-time feature paired with the engagement type it stands for. */
  val dwellTimeFeatureToEngagementMap = Map(
    TimelinesSharedFeatures.TWEET_DETAIL_DWELL_TIME_MS -> EngagementType.TweetDetailDwell,
    TimelinesSharedFeatures.PROFILE_DWELL_TIME_MS -> EngagementType.ProfileDwell,
    TimelinesSharedFeatures.FULLSCREEN_VIDEO_DWELL_TIME_MS -> EngagementType.FullscreenVideoDwell
  )

  /**
   * Builds an [[Engagement]] from one dwell-time feature of the record.
   *
   * @param rdr              record to read the dwell time and timestamp from
   * @param dwellTimeFeature dwell-time feature to look up on the record
   * @param engagementType   engagement type assigned when the feature is present
   * @return `None` when the record lacks `dwellTimeFeature`; otherwise an engagement stamped
   *         with the record's TIMESTAMP and weighted by the dwell-time value
   */
  def dwellFeatureToEngagement(
    rdr: RichDataRecord,
    dwellTimeFeature: Feature[JDouble],
    engagementType: EngagementType
  ): Option[Engagement] =
    if (!rdr.hasFeature(dwellTimeFeature)) {
      None
    } else {
      Some(
        Engagement(
          engagementType = engagementType,
          timestampMs = rdr.getFeatureValue(SharedFeatures.TIMESTAMP),
          weight = Some(rdr.getFeatureValue(dwellTimeFeature))
        )
      )
    }

  /** Extends the feature context with the combined dwell-engagement features. */
  override def transformContext(featureContext: FeatureContext): FeatureContext = {
    val dwellLabelFeatures =
      CombinedFeatures.TweetDetailDwellEngagements ++
        CombinedFeatures.ProfileDwellEngagements ++
        CombinedFeatures.FullscreenVideoDwellEngagements
    featureContext.addFeatures(dwellLabelFeatures.toSeq: _*)
  }

  /** Derives engagements from the record's dwell-time features and sets the dwell labels. */
  override def transform(record: DataRecord): Unit = {
    val richRecord = new RichDataRecord(record)

    // One engagement per dwell-time feature that is actually present on the record.
    val dwellEngagements = dwellTimeFeatureToEngagementMap.toSeq.flatMap {
      case (feature, engagementType) =>
        dwellFeatureToEngagement(richRecord, feature, engagementType)
    }

    // Re-use the BCE (behavior client events) label conversion in EngagementTypeConverter so
    // these labels align with the BCE labels generated for offline training data.
    EngagementLabelFeaturesDataRecordUtils.setDwellTimeFeatures(
      richRecord,
      Some(dwellEngagements),
      AdapterConsumer.Combined
    )
  }
}
|
|
@ -0,0 +1,353 @@
|
|||
# original_author_aggregates: snapshot-segmented dataset keyed by AggregationKey with
# (BatchID, DataRecord) values; schema object is TimelinesAggregationKeyValInjections.OriginalAuthor.
create_datasets(
|
||||
base_name = "original_author_aggregates",
|
||||
fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/original_author_aggregates/1556496000000",
|
||||
key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey",
|
||||
platform = "java8",
|
||||
role = "timelines",
|
||||
scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.OriginalAuthor",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)",
|
||||
scala_dependencies = [
|
||||
":injections",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
],
|
||||
)
|
||||
|
||||
# twitter_wide_user_aggregates: snapshot-segmented dataset keyed by AggregationKey with
# (BatchID, DataRecord) values; schema object is TimelinesAggregationKeyValInjections.TwitterWideUser.
create_datasets(
|
||||
base_name = "twitter_wide_user_aggregates",
|
||||
fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/twitter_wide_user_aggregates/1556496000000",
|
||||
key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey",
|
||||
platform = "java8",
|
||||
role = "timelines",
|
||||
scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.TwitterWideUser",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)",
|
||||
scala_dependencies = [
|
||||
":injections",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
],
|
||||
)
|
||||
|
||||
# twitter_wide_user_author_aggregates: snapshot-segmented dataset keyed by AggregationKey with
# (BatchID, DataRecord) values; schema object is
# TimelinesAggregationKeyValInjections.TwitterWideUserAuthor.
create_datasets(
|
||||
base_name = "twitter_wide_user_author_aggregates",
|
||||
fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/twitter_wide_user_author_aggregates/1556323200000",
|
||||
key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey",
|
||||
platform = "java8",
|
||||
role = "timelines",
|
||||
scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.TwitterWideUserAuthor",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)",
|
||||
scala_dependencies = [
|
||||
":injections",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
],
|
||||
)
|
||||
|
||||
# user_aggregates: snapshot-segmented dataset keyed by AggregationKey with
# (BatchID, DataRecord) values; schema object is TimelinesAggregationKeyValInjections.User
# (the same schema is also used by the aggregates_canary dataset).
create_datasets(
|
||||
base_name = "user_aggregates",
|
||||
fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_aggregates/1556150400000",
|
||||
key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey",
|
||||
platform = "java8",
|
||||
role = "timelines",
|
||||
scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.User",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)",
|
||||
scala_dependencies = [
|
||||
":injections",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
],
|
||||
)
|
||||
|
||||
# user_author_aggregates: snapshot-segmented dataset keyed by AggregationKey with
# (BatchID, DataRecord) values; schema object is TimelinesAggregationKeyValInjections.UserAuthor.
create_datasets(
|
||||
base_name = "user_author_aggregates",
|
||||
fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_author_aggregates/1556064000000",
|
||||
key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey",
|
||||
platform = "java8",
|
||||
role = "timelines",
|
||||
scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserAuthor",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)",
|
||||
scala_dependencies = [
|
||||
":injections",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
],
|
||||
)
|
||||
|
||||
# aggregates_canary: canary dataset that re-uses the User schema object of user_aggregates.
# It is the only dataset here whose fallback_path is a gs:// location rather than viewfs://.
# NOTE(review): the fallback_path has a double slash ("twttr.net//canaries") right after the
# bucket name — confirm this is the real object prefix and not a typo.
create_datasets(
|
||||
base_name = "aggregates_canary",
|
||||
fallback_path = "gs://user.timelines.dp.gcp.twttr.net//canaries/processed/aggregates_v2/user_aggregates/1622851200000",
|
||||
key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey",
|
||||
platform = "java8",
|
||||
role = "timelines",
|
||||
scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.User",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)",
|
||||
scala_dependencies = [
|
||||
":injections",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
],
|
||||
)
|
||||
|
||||
# user_engager_aggregates: snapshot-segmented dataset keyed by AggregationKey with
# (BatchID, DataRecord) values; schema object is TimelinesAggregationKeyValInjections.UserEngager.
create_datasets(
|
||||
base_name = "user_engager_aggregates",
|
||||
fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_engager_aggregates/1556496000000",
|
||||
key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey",
|
||||
platform = "java8",
|
||||
role = "timelines",
|
||||
scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserEngager",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)",
|
||||
scala_dependencies = [
|
||||
":injections",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
],
|
||||
)
|
||||
|
||||
# user_original_author_aggregates: snapshot-segmented dataset keyed by AggregationKey with
# (BatchID, DataRecord) values; schema object is
# TimelinesAggregationKeyValInjections.UserOriginalAuthor.
create_datasets(
|
||||
base_name = "user_original_author_aggregates",
|
||||
fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_original_author_aggregates/1556496000000",
|
||||
key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey",
|
||||
platform = "java8",
|
||||
role = "timelines",
|
||||
scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserOriginalAuthor",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)",
|
||||
scala_dependencies = [
|
||||
":injections",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
],
|
||||
)
|
||||
|
||||
# author_topic_aggregates: snapshot-segmented dataset keyed by AggregationKey with
# (BatchID, DataRecord) values; schema object is TimelinesAggregationKeyValInjections.AuthorTopic.
create_datasets(
|
||||
base_name = "author_topic_aggregates",
|
||||
fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/author_topic_aggregates/1589932800000",
|
||||
key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey",
|
||||
platform = "java8",
|
||||
role = "timelines",
|
||||
scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.AuthorTopic",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)",
|
||||
scala_dependencies = [
|
||||
":injections",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
],
|
||||
)
|
||||
|
||||
# user_topic_aggregates: snapshot-segmented dataset keyed by AggregationKey with
# (BatchID, DataRecord) values; schema object is TimelinesAggregationKeyValInjections.UserTopic.
create_datasets(
|
||||
base_name = "user_topic_aggregates",
|
||||
fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_topic_aggregates/1590278400000",
|
||||
key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey",
|
||||
platform = "java8",
|
||||
role = "timelines",
|
||||
scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserTopic",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)",
|
||||
scala_dependencies = [
|
||||
":injections",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
],
|
||||
)
|
||||
|
||||
# user_inferred_topic_aggregates: snapshot-segmented dataset keyed by AggregationKey with
# (BatchID, DataRecord) values; schema object is
# TimelinesAggregationKeyValInjections.UserInferredTopic.
create_datasets(
|
||||
base_name = "user_inferred_topic_aggregates",
|
||||
fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_inferred_topic_aggregates/1599696000000",
|
||||
key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey",
|
||||
platform = "java8",
|
||||
role = "timelines",
|
||||
scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserInferredTopic",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)",
|
||||
scala_dependencies = [
|
||||
":injections",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
],
|
||||
)
|
||||
|
||||
# user_mention_aggregates: snapshot-segmented dataset keyed by AggregationKey with
# (BatchID, DataRecord) values; schema object is TimelinesAggregationKeyValInjections.UserMention.
create_datasets(
|
||||
base_name = "user_mention_aggregates",
|
||||
fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_mention_aggregates/1556582400000",
|
||||
key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey",
|
||||
platform = "java8",
|
||||
role = "timelines",
|
||||
scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserMention",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)",
|
||||
scala_dependencies = [
|
||||
":injections",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
],
|
||||
)
|
||||
|
||||
# user_request_dow_aggregates: snapshot-segmented dataset keyed by AggregationKey with
# (BatchID, DataRecord) values; schema object is
# TimelinesAggregationKeyValInjections.UserRequestDow.
create_datasets(
|
||||
base_name = "user_request_dow_aggregates",
|
||||
fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_request_dow_aggregates/1556236800000",
|
||||
key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey",
|
||||
platform = "java8",
|
||||
role = "timelines",
|
||||
scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserRequestDow",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)",
|
||||
scala_dependencies = [
|
||||
":injections",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
],
|
||||
)
|
||||
|
||||
# user_request_hour_aggregates: snapshot-segmented dataset keyed by AggregationKey with
# (BatchID, DataRecord) values; schema object is
# TimelinesAggregationKeyValInjections.UserRequestHour.
create_datasets(
|
||||
base_name = "user_request_hour_aggregates",
|
||||
fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_request_hour_aggregates/1556150400000",
|
||||
key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey",
|
||||
platform = "java8",
|
||||
role = "timelines",
|
||||
scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserRequestHour",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)",
|
||||
scala_dependencies = [
|
||||
":injections",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
# user_list_aggregates: snapshot-segmented dataset keyed by AggregationKey with
# (BatchID, DataRecord) values; schema object is TimelinesAggregationKeyValInjections.UserList.
create_datasets(
|
||||
base_name = "user_list_aggregates",
|
||||
fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_list_aggregates/1590624000000",
|
||||
key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey",
|
||||
platform = "java8",
|
||||
role = "timelines",
|
||||
scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserList",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)",
|
||||
scala_dependencies = [
|
||||
":injections",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
# user_media_understanding_annotation_aggregates: snapshot-segmented dataset keyed by
# AggregationKey with (BatchID, DataRecord) values; schema object is
# TimelinesAggregationKeyValInjections.UserMediaUnderstandingAnnotation.
# NOTE(review): unlike the other create_datasets calls above, no fallback_path is given here —
# confirm that the omission is intentional.
create_datasets(
|
||||
base_name = "user_media_understanding_annotation_aggregates",
|
||||
key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey",
|
||||
platform = "java8",
|
||||
role = "timelines",
|
||||
scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserMediaUnderstandingAnnotation",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)",
|
||||
scala_dependencies = [
|
||||
":injections",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
],
|
||||
)
|
||||
|
||||
# Main aggregates library. It has no explicit `name`.
# NOTE(review): presumably the build tool falls back to a default target name — confirm.
# It depends on one "<base_name>-scala" target for each create_datasets call above, and shares
# most sources with :injections; it adds BCELabelTransformFromUUADataRecord.scala and
# TimelinesAggregationConfig.scala and omits TimelinesAggregationKeyValInjections.scala.
# NOTE(review): :injections lists ".../prediction/features/semantic_core_features" as a
# dependency while this target does not, although both compile
# TimelinesAggregationConfigDetails.scala — confirm whether it is needed here as well.
scala_library(
|
||||
sources = [
|
||||
"BCELabelTransformFromUUADataRecord.scala",
|
||||
"FeatureSelectorConfig.scala",
|
||||
"RecapUserFeatureAggregation.scala",
|
||||
"RectweetUserFeatureAggregation.scala",
|
||||
"TimelinesAggregationConfig.scala",
|
||||
"TimelinesAggregationConfigDetails.scala",
|
||||
"TimelinesAggregationConfigTrait.scala",
|
||||
"TimelinesAggregationSources.scala",
|
||||
],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
":aggregates_canary-scala",
|
||||
":author_topic_aggregates-scala",
|
||||
":original_author_aggregates-scala",
|
||||
":twitter_wide_user_aggregates-scala",
|
||||
":twitter_wide_user_author_aggregates-scala",
|
||||
":user_aggregates-scala",
|
||||
":user_author_aggregates-scala",
|
||||
":user_engager_aggregates-scala",
|
||||
":user_inferred_topic_aggregates-scala",
|
||||
":user_list_aggregates-scala",
|
||||
":user_media_understanding_annotation_aggregates-scala",
|
||||
":user_mention_aggregates-scala",
|
||||
":user_original_author_aggregates-scala",
|
||||
":user_request_dow_aggregates-scala",
|
||||
":user_request_hour_aggregates-scala",
|
||||
":user_topic_aggregates-scala",
|
||||
"src/java/com/twitter/ml/api:api-base",
|
||||
"src/java/com/twitter/ml/api/constant",
|
||||
"src/java/com/twitter/ml/api/matcher",
|
||||
"src/scala/com/twitter/common/text/util",
|
||||
"src/scala/com/twitter/dal/client/dataset",
|
||||
"src/scala/com/twitter/frigate/data_pipeline/features_aggregated/core",
|
||||
"src/scala/com/twitter/scalding_internal/multiformat/format",
|
||||
"src/scala/com/twitter/timelines/prediction/common/adapters:engagement-converter",
|
||||
"src/scala/com/twitter/timelines/prediction/features/client_log_event",
|
||||
"src/scala/com/twitter/timelines/prediction/features/common",
|
||||
"src/scala/com/twitter/timelines/prediction/features/engagement_features",
|
||||
"src/scala/com/twitter/timelines/prediction/features/escherbird",
|
||||
"src/scala/com/twitter/timelines/prediction/features/itl",
|
||||
"src/scala/com/twitter/timelines/prediction/features/list_features",
|
||||
"src/scala/com/twitter/timelines/prediction/features/p_home_latest",
|
||||
"src/scala/com/twitter/timelines/prediction/features/real_graph",
|
||||
"src/scala/com/twitter/timelines/prediction/features/recap",
|
||||
"src/scala/com/twitter/timelines/prediction/features/request_context",
|
||||
"src/scala/com/twitter/timelines/prediction/features/simcluster",
|
||||
"src/scala/com/twitter/timelines/prediction/features/time_features",
|
||||
"src/scala/com/twitter/timelines/prediction/transform/filter",
|
||||
"src/thrift/com/twitter/timelines/suggests/common:engagement-scala",
|
||||
"timelines/data_processing/ad_hoc/recap/data_record_preparation:recap_data_records_agg_minimal-java",
|
||||
"util/util-core:scala",
|
||||
],
|
||||
)
|
||||
|
||||
# :injections — the library every create_datasets call above lists in scala_dependencies.
# It compiles TimelinesAggregationKeyValInjections.scala (the schema objects the datasets name
# in scala_schema) together with the aggregation config sources it is built alongside.
scala_library(
|
||||
name = "injections",
|
||||
sources = [
|
||||
"FeatureSelectorConfig.scala",
|
||||
"RecapUserFeatureAggregation.scala",
|
||||
"RectweetUserFeatureAggregation.scala",
|
||||
"TimelinesAggregationConfigDetails.scala",
|
||||
"TimelinesAggregationConfigTrait.scala",
|
||||
"TimelinesAggregationKeyValInjections.scala",
|
||||
"TimelinesAggregationSources.scala",
|
||||
],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"src/java/com/twitter/ml/api:api-base",
|
||||
"src/java/com/twitter/ml/api/constant",
|
||||
"src/java/com/twitter/ml/api/matcher",
|
||||
"src/scala/com/twitter/common/text/util",
|
||||
"src/scala/com/twitter/dal/client/dataset",
|
||||
"src/scala/com/twitter/frigate/data_pipeline/features_aggregated/core",
|
||||
"src/scala/com/twitter/scalding_internal/multiformat/format",
|
||||
"src/scala/com/twitter/timelines/prediction/features/client_log_event",
|
||||
"src/scala/com/twitter/timelines/prediction/features/common",
|
||||
"src/scala/com/twitter/timelines/prediction/features/engagement_features",
|
||||
"src/scala/com/twitter/timelines/prediction/features/escherbird",
|
||||
"src/scala/com/twitter/timelines/prediction/features/itl",
|
||||
"src/scala/com/twitter/timelines/prediction/features/list_features",
|
||||
"src/scala/com/twitter/timelines/prediction/features/p_home_latest",
|
||||
"src/scala/com/twitter/timelines/prediction/features/real_graph",
|
||||
"src/scala/com/twitter/timelines/prediction/features/recap",
|
||||
"src/scala/com/twitter/timelines/prediction/features/request_context",
|
||||
"src/scala/com/twitter/timelines/prediction/features/semantic_core_features",
|
||||
"src/scala/com/twitter/timelines/prediction/features/simcluster",
|
||||
"src/scala/com/twitter/timelines/prediction/features/time_features",
|
||||
"src/scala/com/twitter/timelines/prediction/transform/filter",
|
||||
"timelines/data_processing/ad_hoc/recap/data_record_preparation:recap_data_records_agg_minimal-java",
|
||||
"util/util-core:scala",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,121 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates
|
||||
|
||||
import com.twitter.ml.api.matcher.FeatureMatcher
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Glob-based feature matchers describing which aggregate features are selected.
 *
 * Aggregate feature globs are assembled as `<prefix>.<label>.<suffix>` from the prefix/suffix
 * pairs and the label names listed below.
 */
object FeatureSelectorConfig {

  /** (aggregate prefix, feature glob suffix) pairs shared by the full and the reduced glob lists. */
  val BasePairsToStore = Seq(
    ("twitter_wide_user_aggregate.pair", "*"),
    ("twitter_wide_user_author_aggregate.pair", "*"),
    ("user_aggregate_v5.continuous.pair", "*"),
    ("user_aggregate_v7.pair", "*"),
    ("user_author_aggregate_v2.pair", "recap.earlybird.*"),
    ("user_author_aggregate_v2.pair", "recap.searchfeature.*"),
    ("user_author_aggregate_v2.pair", "recap.tweetfeature.embeds*"),
    ("user_author_aggregate_v2.pair", "recap.tweetfeature.link_count*"),
    ("user_author_aggregate_v2.pair", "engagement_features.in_network.*"),
    ("user_author_aggregate_v2.pair", "recap.tweetfeature.is_reply.*"),
    ("user_author_aggregate_v2.pair", "recap.tweetfeature.is_retweet.*"),
    ("user_author_aggregate_v2.pair", "recap.tweetfeature.num_mentions.*"),
    ("user_author_aggregate_v5.pair", "*"),
    ("user_author_aggregate_tweetsource_v1.pair", "*"),
    ("user_engager_aggregate.pair", "*"),
    ("user_mention_aggregate.pair", "*"),
    ("user_request_context_aggregate.dow.pair", "*"),
    ("user_request_context_aggregate.hour.pair", "*"),
    ("user_aggregate_v6.pair", "*"),
    ("user_original_author_aggregate_v1.pair", "*"),
    ("user_original_author_aggregate_v2.pair", "*"),
    ("original_author_aggregate_v1.pair", "*"),
    ("original_author_aggregate_v2.pair", "*"),
    ("author_topic_aggregate.pair", "*"),
    ("user_list_aggregate.pair", "*"),
    ("user_topic_aggregate.pair", "*"),
    ("user_topic_aggregate_v2.pair", "*"),
    ("user_inferred_topic_aggregate.pair", "*"),
    ("user_inferred_topic_aggregate_v2.pair", "*"),
    // Listed once: a repeated pair only produces redundant, identical glob matchers.
    ("user_media_annotation_aggregate.pair", "*"),
    ("user_author_good_click_aggregate.pair", "*"),
    ("user_engager_good_click_aggregate.pair", "*")
  )

  /** The base pairs plus the user-aggregate pairs that are only part of the full glob list. */
  val PairsToStore = BasePairsToStore ++ Seq(
    ("user_aggregate_v2.pair", "*"),
    ("user_aggregate_v5.boolean.pair", "*"),
    ("user_aggregate_tweetsource_v1.pair", "*")
  )

  /** Label names placed between the prefix and the suffix of every generated glob. */
  val LabelsToStore = Seq(
    "any_label",
    "recap.engagement.is_favorited",
    "recap.engagement.is_retweeted",
    "recap.engagement.is_replied",
    "recap.engagement.is_open_linked",
    "recap.engagement.is_profile_clicked",
    "recap.engagement.is_clicked",
    "recap.engagement.is_photo_expanded",
    "recap.engagement.is_video_playback_50",
    "recap.engagement.is_video_quality_viewed",
    "recap.engagement.is_replied_reply_impressed_by_author",
    "recap.engagement.is_replied_reply_favorited_by_author",
    "recap.engagement.is_replied_reply_replied_by_author",
    "recap.engagement.is_report_tweet_clicked",
    "recap.engagement.is_block_clicked",
    "recap.engagement.is_mute_clicked",
    "recap.engagement.is_dont_like",
    "recap.engagement.is_good_clicked_convo_desc_favorited_or_replied",
    "recap.engagement.is_good_clicked_convo_desc_v2",
    "itl.engagement.is_favorited",
    "itl.engagement.is_retweeted",
    "itl.engagement.is_replied",
    "itl.engagement.is_open_linked",
    "itl.engagement.is_profile_clicked",
    "itl.engagement.is_clicked",
    "itl.engagement.is_photo_expanded",
    "itl.engagement.is_video_playback_50"
  )

  /** Builds one `<prefix>.<label>.<suffix>` glob matcher per (pair, label) combination. */
  private def pairGlobs(pairs: Seq[(String, String)]) =
    for {
      (prefix, suffix) <- pairs
      label <- LabelsToStore
    } yield FeatureMatcher.glob(s"${prefix}.${label}.${suffix}")

  /** Glob matchers for every combination of [[PairsToStore]] and [[LabelsToStore]]. */
  val PairGlobsToStore = pairGlobs(PairsToStore)

  /**
   * Matcher for the individually named (non pair-generated) features: id/key features,
   * request-context features and two specific aggregate globs.
   */
  val BaseAggregateV2FeatureSelector = FeatureMatcher
    .none()
    .or(
      FeatureMatcher.glob("meta.user_id"),
      FeatureMatcher.glob("meta.author_id"),
      FeatureMatcher.glob("entities.original_author_id"),
      FeatureMatcher.glob("entities.topic_id"),
      FeatureMatcher
        .glob("entities.inferred_topic_ids" + TypedAggregateGroup.SparseFeatureSuffix),
      FeatureMatcher.glob("timelines.meta.list_id"),
      FeatureMatcher.glob("list.id"),
      FeatureMatcher
        .glob("engagement_features.user_ids.public" + TypedAggregateGroup.SparseFeatureSuffix),
      FeatureMatcher
        .glob("entities.users.mentioned_screen_names" + TypedAggregateGroup.SparseFeatureSuffix),
      FeatureMatcher.glob("user_aggregate_v2.pair.recap.engagement.is_dont_like.*"),
      FeatureMatcher.glob("user_author_aggregate_v2.pair.any_label.recap.tweetfeature.has_*"),
      FeatureMatcher.glob("request_context.country_code"),
      FeatureMatcher.glob("request_context.timestamp_gmt_dow"),
      FeatureMatcher.glob("request_context.timestamp_gmt_hour"),
      FeatureMatcher.glob(
        "semantic_core.media_understanding.high_recall.non_sensitive.entity_ids" + TypedAggregateGroup.SparseFeatureSuffix)
    )

  /** The base selector OR-ed with every glob in [[PairGlobsToStore]]. */
  val AggregatesV2ProdFeatureSelector = BaseAggregateV2FeatureSelector
    .orList(PairGlobsToStore.asJava)

  /**
   * Reduced glob list: all [[BasePairsToStore]] x [[LabelsToStore]] combinations, plus only
   * four label-specific globs for `user_aggregate_v2.pair` instead of every label.
   */
  val ReducedPairGlobsToStore = pairGlobs(BasePairsToStore) ++ Seq(
    FeatureMatcher.glob("user_aggregate_v2.pair.any_label.*"),
    FeatureMatcher.glob("user_aggregate_v2.pair.recap.engagement.is_favorited.*"),
    FeatureMatcher.glob("user_aggregate_v2.pair.recap.engagement.is_photo_expanded.*"),
    FeatureMatcher.glob("user_aggregate_v2.pair.recap.engagement.is_profile_clicked.*")
  )
}
|
|
@ -0,0 +1,6 @@
|
|||
## Timelines Aggregation Jobs
|
||||
|
||||
This directory contains the specific definition of aggregate jobs that generate features used by the Heavy Ranker.
|
||||
The primary files of interest are [`TimelinesAggregationConfigDetails.scala`](TimelinesAggregationConfigDetails.scala), which contains the definitions for the batch aggregate jobs, and [`real_time/TimelinesOnlineAggregationConfigBase.scala`](real_time/TimelinesOnlineAggregationConfigBase.scala), which contains the definitions for the real-time aggregate jobs.
|
||||
|
||||
The aggregation framework that these jobs are based on is [here](../../../../../../../../timelines/data_processing/ml_util/aggregation_framework).
|
|
@ -0,0 +1,415 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates
|
||||
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures
|
||||
import com.twitter.timelines.prediction.features.engagement_features.EngagementDataRecordFeatures
|
||||
import com.twitter.timelines.prediction.features.real_graph.RealGraphDataRecordFeatures
|
||||
import com.twitter.timelines.prediction.features.recap.RecapFeatures
|
||||
import com.twitter.timelines.prediction.features.time_features.TimeDataRecordFeatures
|
||||
|
||||
/**
 * Named sets of features and binary labels used when defining aggregates.
 *
 * Every member is an immutable `Set` built from constants declared in [[RecapFeatures]],
 * [[RealGraphDataRecordFeatures]], [[TimelinesSharedFeatures]], [[EngagementDataRecordFeatures]]
 * and [[TimeDataRecordFeatures]]. Members are initialized in declaration order, so a set may only
 * be composed from sets declared above it.
 */
object RecapUserFeatureAggregation {

  /** Recap tweet features together with the real-graph EWMA / day-count features. */
  val RecapFeaturesForAggregation: Set[Feature[_]] = Set(
    RecapFeatures.HAS_IMAGE,
    RecapFeatures.HAS_VIDEO,
    RecapFeatures.FROM_MUTUAL_FOLLOW,
    RecapFeatures.HAS_CARD,
    RecapFeatures.HAS_NEWS,
    RecapFeatures.REPLY_COUNT,
    RecapFeatures.FAV_COUNT,
    RecapFeatures.RETWEET_COUNT,
    RecapFeatures.BLENDER_SCORE,
    RecapFeatures.CONVERSATIONAL_COUNT,
    RecapFeatures.IS_BUSINESS_SCORE,
    RecapFeatures.CONTAINS_MEDIA,
    RecapFeatures.RETWEET_SEARCHER,
    RecapFeatures.REPLY_SEARCHER,
    RecapFeatures.MENTION_SEARCHER,
    RecapFeatures.REPLY_OTHER,
    RecapFeatures.RETWEET_OTHER,
    RecapFeatures.MATCH_UI_LANG,
    RecapFeatures.MATCH_SEARCHER_MAIN_LANG,
    RecapFeatures.MATCH_SEARCHER_LANGS,
    RecapFeatures.TWEET_COUNT_FROM_USER_IN_SNAPSHOT,
    RecapFeatures.TEXT_SCORE,
    // Real-graph features: one (EWMA, non-zero days, elapsed days, days since last) group per signal.
    RealGraphDataRecordFeatures.NUM_RETWEETS_EWMA,
    RealGraphDataRecordFeatures.NUM_RETWEETS_NON_ZERO_DAYS,
    RealGraphDataRecordFeatures.NUM_RETWEETS_ELAPSED_DAYS,
    RealGraphDataRecordFeatures.NUM_RETWEETS_DAYS_SINCE_LAST,
    RealGraphDataRecordFeatures.NUM_FAVORITES_EWMA,
    RealGraphDataRecordFeatures.NUM_FAVORITES_NON_ZERO_DAYS,
    RealGraphDataRecordFeatures.NUM_FAVORITES_ELAPSED_DAYS,
    RealGraphDataRecordFeatures.NUM_FAVORITES_DAYS_SINCE_LAST,
    RealGraphDataRecordFeatures.NUM_MENTIONS_EWMA,
    RealGraphDataRecordFeatures.NUM_MENTIONS_NON_ZERO_DAYS,
    RealGraphDataRecordFeatures.NUM_MENTIONS_ELAPSED_DAYS,
    RealGraphDataRecordFeatures.NUM_MENTIONS_DAYS_SINCE_LAST,
    RealGraphDataRecordFeatures.NUM_TWEET_CLICKS_EWMA,
    RealGraphDataRecordFeatures.NUM_TWEET_CLICKS_NON_ZERO_DAYS,
    RealGraphDataRecordFeatures.NUM_TWEET_CLICKS_ELAPSED_DAYS,
    RealGraphDataRecordFeatures.NUM_TWEET_CLICKS_DAYS_SINCE_LAST,
    RealGraphDataRecordFeatures.NUM_PROFILE_VIEWS_EWMA,
    RealGraphDataRecordFeatures.NUM_PROFILE_VIEWS_NON_ZERO_DAYS,
    RealGraphDataRecordFeatures.NUM_PROFILE_VIEWS_ELAPSED_DAYS,
    RealGraphDataRecordFeatures.NUM_PROFILE_VIEWS_DAYS_SINCE_LAST,
    RealGraphDataRecordFeatures.TOTAL_DWELL_TIME_EWMA,
    RealGraphDataRecordFeatures.TOTAL_DWELL_TIME_NON_ZERO_DAYS,
    RealGraphDataRecordFeatures.TOTAL_DWELL_TIME_ELAPSED_DAYS,
    RealGraphDataRecordFeatures.TOTAL_DWELL_TIME_DAYS_SINCE_LAST,
    RealGraphDataRecordFeatures.NUM_INSPECTED_TWEETS_EWMA,
    RealGraphDataRecordFeatures.NUM_INSPECTED_TWEETS_NON_ZERO_DAYS,
    RealGraphDataRecordFeatures.NUM_INSPECTED_TWEETS_ELAPSED_DAYS,
    RealGraphDataRecordFeatures.NUM_INSPECTED_TWEETS_DAYS_SINCE_LAST
  )

  /** Base set of binary engagement labels; extended by [[LabelsV2]]. */
  val RecapLabelsForAggregation: Set[Feature.Binary] = Set(
    RecapFeatures.IS_FAVORITED,
    RecapFeatures.IS_RETWEETED,
    RecapFeatures.IS_CLICKED,
    RecapFeatures.IS_PROFILE_CLICKED,
    RecapFeatures.IS_OPEN_LINKED
  )

  /** Single-element set holding the dwell-time feature. */
  val DwellDuration: Set[Feature[_]] = Set(
    TimelinesSharedFeatures.DWELL_TIME_MS
  )

  /** [[RecapFeaturesForAggregation]] plus further recap features and in-network engagement counts. */
  val UserFeaturesV2: Set[Feature[_]] = RecapFeaturesForAggregation ++ Set(
    RecapFeatures.HAS_VINE,
    RecapFeatures.HAS_PERISCOPE,
    RecapFeatures.HAS_PRO_VIDEO,
    RecapFeatures.HAS_VISIBLE_LINK,
    RecapFeatures.BIDIRECTIONAL_FAV_COUNT,
    RecapFeatures.UNIDIRECTIONAL_FAV_COUNT,
    RecapFeatures.BIDIRECTIONAL_REPLY_COUNT,
    RecapFeatures.UNIDIRECTIONAL_REPLY_COUNT,
    RecapFeatures.BIDIRECTIONAL_RETWEET_COUNT,
    RecapFeatures.UNIDIRECTIONAL_RETWEET_COUNT,
    RecapFeatures.EMBEDS_URL_COUNT,
    RecapFeatures.EMBEDS_IMPRESSION_COUNT,
    RecapFeatures.VIDEO_VIEW_COUNT,
    RecapFeatures.IS_RETWEET,
    RecapFeatures.IS_REPLY,
    RecapFeatures.IS_EXTENDED_REPLY,
    RecapFeatures.HAS_LINK,
    RecapFeatures.HAS_TREND,
    RecapFeatures.LINK_LANGUAGE,
    RecapFeatures.NUM_HASHTAGS,
    RecapFeatures.NUM_MENTIONS,
    RecapFeatures.IS_SENSITIVE,
    RecapFeatures.HAS_MULTIPLE_MEDIA,
    RecapFeatures.USER_REP,
    RecapFeatures.FAV_COUNT_V2,
    RecapFeatures.RETWEET_COUNT_V2,
    RecapFeatures.REPLY_COUNT_V2,
    RecapFeatures.LINK_COUNT,
    EngagementDataRecordFeatures.InNetworkFavoritesCount,
    EngagementDataRecordFeatures.InNetworkRetweetsCount,
    EngagementDataRecordFeatures.InNetworkRepliesCount
  )

  /** Explicitly enumerated recap and in-network engagement features (no real-graph features). */
  val UserAuthorFeaturesV2: Set[Feature[_]] = Set(
    RecapFeatures.HAS_IMAGE,
    RecapFeatures.HAS_VINE,
    RecapFeatures.HAS_PERISCOPE,
    RecapFeatures.HAS_PRO_VIDEO,
    RecapFeatures.HAS_VIDEO,
    RecapFeatures.HAS_CARD,
    RecapFeatures.HAS_NEWS,
    RecapFeatures.HAS_VISIBLE_LINK,
    RecapFeatures.REPLY_COUNT,
    RecapFeatures.FAV_COUNT,
    RecapFeatures.RETWEET_COUNT,
    RecapFeatures.BLENDER_SCORE,
    RecapFeatures.CONVERSATIONAL_COUNT,
    RecapFeatures.IS_BUSINESS_SCORE,
    RecapFeatures.CONTAINS_MEDIA,
    RecapFeatures.RETWEET_SEARCHER,
    RecapFeatures.REPLY_SEARCHER,
    RecapFeatures.MENTION_SEARCHER,
    RecapFeatures.REPLY_OTHER,
    RecapFeatures.RETWEET_OTHER,
    RecapFeatures.MATCH_UI_LANG,
    RecapFeatures.MATCH_SEARCHER_MAIN_LANG,
    RecapFeatures.MATCH_SEARCHER_LANGS,
    RecapFeatures.TWEET_COUNT_FROM_USER_IN_SNAPSHOT,
    RecapFeatures.TEXT_SCORE,
    RecapFeatures.BIDIRECTIONAL_FAV_COUNT,
    RecapFeatures.UNIDIRECTIONAL_FAV_COUNT,
    RecapFeatures.BIDIRECTIONAL_REPLY_COUNT,
    RecapFeatures.UNIDIRECTIONAL_REPLY_COUNT,
    RecapFeatures.BIDIRECTIONAL_RETWEET_COUNT,
    RecapFeatures.UNIDIRECTIONAL_RETWEET_COUNT,
    RecapFeatures.EMBEDS_URL_COUNT,
    RecapFeatures.EMBEDS_IMPRESSION_COUNT,
    RecapFeatures.VIDEO_VIEW_COUNT,
    RecapFeatures.IS_RETWEET,
    RecapFeatures.IS_REPLY,
    RecapFeatures.HAS_LINK,
    RecapFeatures.HAS_TREND,
    RecapFeatures.LINK_LANGUAGE,
    RecapFeatures.NUM_HASHTAGS,
    RecapFeatures.NUM_MENTIONS,
    RecapFeatures.IS_SENSITIVE,
    RecapFeatures.HAS_MULTIPLE_MEDIA,
    RecapFeatures.FAV_COUNT_V2,
    RecapFeatures.RETWEET_COUNT_V2,
    RecapFeatures.REPLY_COUNT_V2,
    RecapFeatures.LINK_COUNT,
    EngagementDataRecordFeatures.InNetworkFavoritesCount,
    EngagementDataRecordFeatures.InNetworkRetweetsCount,
    EngagementDataRecordFeatures.InNetworkRepliesCount
  )

  /** Smaller selection of the recap features listed in [[UserAuthorFeaturesV2]]. */
  val UserAuthorFeaturesV2Count: Set[Feature[_]] = Set(
    RecapFeatures.HAS_IMAGE,
    RecapFeatures.HAS_VINE,
    RecapFeatures.HAS_PERISCOPE,
    RecapFeatures.HAS_PRO_VIDEO,
    RecapFeatures.HAS_VIDEO,
    RecapFeatures.HAS_CARD,
    RecapFeatures.HAS_NEWS,
    RecapFeatures.HAS_VISIBLE_LINK,
    RecapFeatures.FAV_COUNT,
    RecapFeatures.CONTAINS_MEDIA,
    RecapFeatures.RETWEET_SEARCHER,
    RecapFeatures.REPLY_SEARCHER,
    RecapFeatures.MENTION_SEARCHER,
    RecapFeatures.REPLY_OTHER,
    RecapFeatures.RETWEET_OTHER,
    RecapFeatures.MATCH_UI_LANG,
    RecapFeatures.MATCH_SEARCHER_MAIN_LANG,
    RecapFeatures.MATCH_SEARCHER_LANGS,
    RecapFeatures.IS_RETWEET,
    RecapFeatures.IS_REPLY,
    RecapFeatures.HAS_LINK,
    RecapFeatures.HAS_TREND,
    RecapFeatures.IS_SENSITIVE,
    RecapFeatures.HAS_MULTIPLE_MEDIA,
    EngagementDataRecordFeatures.InNetworkFavoritesCount
  )

  /** Recap, in-network engagement and tweet-source features combined into one set. */
  val UserTopicFeaturesV2Count: Set[Feature[_]] = Set(
    RecapFeatures.HAS_IMAGE,
    RecapFeatures.HAS_VIDEO,
    RecapFeatures.HAS_CARD,
    RecapFeatures.HAS_NEWS,
    RecapFeatures.FAV_COUNT,
    RecapFeatures.CONTAINS_MEDIA,
    RecapFeatures.RETWEET_SEARCHER,
    RecapFeatures.REPLY_SEARCHER,
    RecapFeatures.MENTION_SEARCHER,
    RecapFeatures.REPLY_OTHER,
    RecapFeatures.RETWEET_OTHER,
    RecapFeatures.MATCH_UI_LANG,
    RecapFeatures.MATCH_SEARCHER_MAIN_LANG,
    RecapFeatures.MATCH_SEARCHER_LANGS,
    RecapFeatures.IS_RETWEET,
    RecapFeatures.IS_REPLY,
    RecapFeatures.HAS_LINK,
    RecapFeatures.HAS_TREND,
    RecapFeatures.IS_SENSITIVE,
    EngagementDataRecordFeatures.InNetworkFavoritesCount,
    EngagementDataRecordFeatures.InNetworkRetweetsCount,
    TimelinesSharedFeatures.NUM_CAPS,
    TimelinesSharedFeatures.ASPECT_RATIO_DEN,
    TimelinesSharedFeatures.NUM_NEWLINES,
    TimelinesSharedFeatures.IS_360,
    TimelinesSharedFeatures.IS_MANAGED,
    TimelinesSharedFeatures.IS_MONETIZABLE,
    TimelinesSharedFeatures.HAS_SELECTED_PREVIEW_IMAGE,
    TimelinesSharedFeatures.HAS_TITLE,
    TimelinesSharedFeatures.HAS_DESCRIPTION,
    TimelinesSharedFeatures.HAS_VISIT_SITE_CALL_TO_ACTION,
    TimelinesSharedFeatures.HAS_WATCH_NOW_CALL_TO_ACTION
  )

  /** Engagement-count and engagement-timing features (weighted, decayed, fake, time-since). */
  val UserFeaturesV5Continuous: Set[Feature[_]] = Set(
    TimelinesSharedFeatures.QUOTE_COUNT,
    TimelinesSharedFeatures.VISIBLE_TOKEN_RATIO,
    TimelinesSharedFeatures.WEIGHTED_FAV_COUNT,
    TimelinesSharedFeatures.WEIGHTED_RETWEET_COUNT,
    TimelinesSharedFeatures.WEIGHTED_REPLY_COUNT,
    TimelinesSharedFeatures.WEIGHTED_QUOTE_COUNT,
    TimelinesSharedFeatures.EMBEDS_IMPRESSION_COUNT_V2,
    TimelinesSharedFeatures.EMBEDS_URL_COUNT_V2,
    TimelinesSharedFeatures.DECAYED_FAVORITE_COUNT,
    TimelinesSharedFeatures.DECAYED_RETWEET_COUNT,
    TimelinesSharedFeatures.DECAYED_REPLY_COUNT,
    TimelinesSharedFeatures.DECAYED_QUOTE_COUNT,
    TimelinesSharedFeatures.FAKE_FAVORITE_COUNT,
    TimelinesSharedFeatures.FAKE_RETWEET_COUNT,
    TimelinesSharedFeatures.FAKE_REPLY_COUNT,
    TimelinesSharedFeatures.FAKE_QUOTE_COUNT,
    TimeDataRecordFeatures.LAST_FAVORITE_SINCE_CREATION_HRS,
    TimeDataRecordFeatures.LAST_RETWEET_SINCE_CREATION_HRS,
    TimeDataRecordFeatures.LAST_REPLY_SINCE_CREATION_HRS,
    TimeDataRecordFeatures.LAST_QUOTE_SINCE_CREATION_HRS,
    TimeDataRecordFeatures.TIME_SINCE_LAST_FAVORITE_HRS,
    TimeDataRecordFeatures.TIME_SINCE_LAST_RETWEET_HRS,
    TimeDataRecordFeatures.TIME_SINCE_LAST_REPLY_HRS,
    TimeDataRecordFeatures.TIME_SINCE_LAST_QUOTE_HRS
  )

  /** Content-label flags, periscope flags and the has-quote flag. */
  val UserFeaturesV5Boolean: Set[Feature[_]] = Set(
    TimelinesSharedFeatures.LABEL_ABUSIVE_FLAG,
    TimelinesSharedFeatures.LABEL_ABUSIVE_HI_RCL_FLAG,
    TimelinesSharedFeatures.LABEL_DUP_CONTENT_FLAG,
    TimelinesSharedFeatures.LABEL_NSFW_HI_PRC_FLAG,
    TimelinesSharedFeatures.LABEL_NSFW_HI_RCL_FLAG,
    TimelinesSharedFeatures.LABEL_SPAM_FLAG,
    TimelinesSharedFeatures.LABEL_SPAM_HI_RCL_FLAG,
    TimelinesSharedFeatures.PERISCOPE_EXISTS,
    TimelinesSharedFeatures.PERISCOPE_IS_LIVE,
    TimelinesSharedFeatures.PERISCOPE_HAS_BEEN_FEATURED,
    TimelinesSharedFeatures.PERISCOPE_IS_CURRENTLY_FEATURED,
    TimelinesSharedFeatures.PERISCOPE_IS_FROM_QUALITY_SOURCE,
    TimelinesSharedFeatures.HAS_QUOTE
  )

  /** The has-quote flag and the content-label flags (no periscope flags). */
  val UserAuthorFeaturesV5: Set[Feature[_]] = Set(
    TimelinesSharedFeatures.HAS_QUOTE,
    TimelinesSharedFeatures.LABEL_ABUSIVE_FLAG,
    TimelinesSharedFeatures.LABEL_ABUSIVE_HI_RCL_FLAG,
    TimelinesSharedFeatures.LABEL_DUP_CONTENT_FLAG,
    TimelinesSharedFeatures.LABEL_NSFW_HI_PRC_FLAG,
    TimelinesSharedFeatures.LABEL_NSFW_HI_RCL_FLAG,
    TimelinesSharedFeatures.LABEL_SPAM_FLAG,
    TimelinesSharedFeatures.LABEL_SPAM_HI_RCL_FLAG
  )

  /** Tweet text and media dimension features. */
  val UserTweetSourceFeaturesV1Continuous: Set[Feature[_]] = Set(
    TimelinesSharedFeatures.NUM_CAPS,
    TimelinesSharedFeatures.NUM_WHITESPACES,
    TimelinesSharedFeatures.TWEET_LENGTH,
    TimelinesSharedFeatures.ASPECT_RATIO_DEN,
    TimelinesSharedFeatures.ASPECT_RATIO_NUM,
    TimelinesSharedFeatures.BIT_RATE,
    TimelinesSharedFeatures.HEIGHT_1,
    TimelinesSharedFeatures.HEIGHT_2,
    TimelinesSharedFeatures.HEIGHT_3,
    TimelinesSharedFeatures.HEIGHT_4,
    TimelinesSharedFeatures.VIDEO_DURATION,
    TimelinesSharedFeatures.WIDTH_1,
    TimelinesSharedFeatures.WIDTH_2,
    TimelinesSharedFeatures.WIDTH_3,
    TimelinesSharedFeatures.WIDTH_4,
    TimelinesSharedFeatures.NUM_MEDIA_TAGS
  )

  /** The has-question flag and the four resize-method features. */
  val UserTweetSourceFeaturesV1Boolean: Set[Feature[_]] = Set(
    TimelinesSharedFeatures.HAS_QUESTION,
    TimelinesSharedFeatures.RESIZE_METHOD_1,
    TimelinesSharedFeatures.RESIZE_METHOD_2,
    TimelinesSharedFeatures.RESIZE_METHOD_3,
    TimelinesSharedFeatures.RESIZE_METHOD_4
  )

  /** Additional tweet-source count features (emojis, stickers, faces, views, ...). */
  val UserTweetSourceFeaturesV2Continuous: Set[Feature[_]] = Set(
    TimelinesSharedFeatures.NUM_EMOJIS,
    TimelinesSharedFeatures.NUM_EMOTICONS,
    TimelinesSharedFeatures.NUM_NEWLINES,
    TimelinesSharedFeatures.NUM_STICKERS,
    TimelinesSharedFeatures.NUM_FACES,
    TimelinesSharedFeatures.NUM_COLOR_PALLETTE_ITEMS,
    TimelinesSharedFeatures.VIEW_COUNT,
    TimelinesSharedFeatures.TWEET_LENGTH_TYPE
  )

  /** Additional tweet-source `IS_*` / `HAS_*` features. */
  val UserTweetSourceFeaturesV2Boolean: Set[Feature[_]] = Set(
    TimelinesSharedFeatures.IS_360,
    TimelinesSharedFeatures.IS_MANAGED,
    TimelinesSharedFeatures.IS_MONETIZABLE,
    TimelinesSharedFeatures.IS_EMBEDDABLE,
    TimelinesSharedFeatures.HAS_SELECTED_PREVIEW_IMAGE,
    TimelinesSharedFeatures.HAS_TITLE,
    TimelinesSharedFeatures.HAS_DESCRIPTION,
    TimelinesSharedFeatures.HAS_VISIT_SITE_CALL_TO_ACTION,
    TimelinesSharedFeatures.HAS_WATCH_NOW_CALL_TO_ACTION
  )

  /** Four tweet-source features; declaration order is kept as-is. */
  val UserAuthorTweetSourceFeaturesV1: Set[Feature[_]] = Set(
    TimelinesSharedFeatures.HAS_QUESTION,
    TimelinesSharedFeatures.TWEET_LENGTH,
    TimelinesSharedFeatures.VIDEO_DURATION,
    TimelinesSharedFeatures.NUM_MEDIA_TAGS
  )

  /** Wider tweet-source selection mixing count features with `IS_*` / `HAS_*` features. */
  val UserAuthorTweetSourceFeaturesV2: Set[Feature[_]] = Set(
    TimelinesSharedFeatures.NUM_CAPS,
    TimelinesSharedFeatures.NUM_WHITESPACES,
    TimelinesSharedFeatures.ASPECT_RATIO_DEN,
    TimelinesSharedFeatures.ASPECT_RATIO_NUM,
    TimelinesSharedFeatures.BIT_RATE,
    TimelinesSharedFeatures.TWEET_LENGTH_TYPE,
    TimelinesSharedFeatures.NUM_EMOJIS,
    TimelinesSharedFeatures.NUM_EMOTICONS,
    TimelinesSharedFeatures.NUM_NEWLINES,
    TimelinesSharedFeatures.NUM_STICKERS,
    TimelinesSharedFeatures.NUM_FACES,
    TimelinesSharedFeatures.IS_360,
    TimelinesSharedFeatures.IS_MANAGED,
    TimelinesSharedFeatures.IS_MONETIZABLE,
    TimelinesSharedFeatures.HAS_SELECTED_PREVIEW_IMAGE,
    TimelinesSharedFeatures.HAS_TITLE,
    TimelinesSharedFeatures.HAS_DESCRIPTION,
    TimelinesSharedFeatures.HAS_VISIT_SITE_CALL_TO_ACTION,
    TimelinesSharedFeatures.HAS_WATCH_NOW_CALL_TO_ACTION
  )

  /** Smaller selection of the features listed in [[UserAuthorTweetSourceFeaturesV2]]. */
  val UserAuthorTweetSourceFeaturesV2Count: Set[Feature[_]] = Set(
    TimelinesSharedFeatures.NUM_CAPS,
    TimelinesSharedFeatures.ASPECT_RATIO_DEN,
    TimelinesSharedFeatures.NUM_NEWLINES,
    TimelinesSharedFeatures.IS_360,
    TimelinesSharedFeatures.IS_MANAGED,
    TimelinesSharedFeatures.IS_MONETIZABLE,
    TimelinesSharedFeatures.HAS_SELECTED_PREVIEW_IMAGE,
    TimelinesSharedFeatures.HAS_TITLE,
    TimelinesSharedFeatures.HAS_DESCRIPTION,
    TimelinesSharedFeatures.HAS_VISIT_SITE_CALL_TO_ACTION,
    TimelinesSharedFeatures.HAS_WATCH_NOW_CALL_TO_ACTION
  )

  /** [[RecapLabelsForAggregation]] plus the replied, photo-expanded and video-playback-50 labels. */
  val LabelsV2: Set[Feature.Binary] = RecapLabelsForAggregation ++ Set(
    RecapFeatures.IS_REPLIED,
    RecapFeatures.IS_PHOTO_EXPANDED,
    RecapFeatures.IS_VIDEO_PLAYBACK_50
  )

  /** Tweet-shape flags paired with [[TwitterWideLabels]] by name. */
  val TwitterWideFeatures: Set[Feature[_]] = Set(
    RecapFeatures.IS_REPLY,
    TimelinesSharedFeatures.HAS_QUOTE,
    RecapFeatures.HAS_MENTION,
    RecapFeatures.HAS_HASHTAG,
    RecapFeatures.HAS_LINK,
    RecapFeatures.HAS_CARD,
    RecapFeatures.CONTAINS_MEDIA
  )

  /** Favorite, retweet and reply labels. */
  val TwitterWideLabels: Set[Feature.Binary] = Set(
    RecapFeatures.IS_FAVORITED,
    RecapFeatures.IS_RETWEETED,
    RecapFeatures.IS_REPLIED
  )

  /** Labels for a reply that was impressed, replied to, or favorited by the author. */
  val ReciprocalLabels: Set[Feature.Binary] = Set(
    RecapFeatures.IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR,
    RecapFeatures.IS_REPLIED_REPLY_REPLIED_BY_AUTHOR,
    RecapFeatures.IS_REPLIED_REPLY_FAVORITED_BY_AUTHOR
  )

  /** Report, block, mute and don't-like labels. */
  val NegativeEngagementLabels: Set[Feature.Binary] = Set(
    RecapFeatures.IS_REPORT_TWEET_CLICKED,
    RecapFeatures.IS_BLOCK_CLICKED,
    RecapFeatures.IS_MUTE_CLICKED,
    RecapFeatures.IS_DONT_LIKE
  )

  /** The two good-click conversation-description labels (V1 and V2). */
  val GoodClickLabels: Set[Feature.Binary] = Set(
    RecapFeatures.IS_GOOD_CLICKED_CONVO_DESC_V1,
    RecapFeatures.IS_GOOD_CLICKED_CONVO_DESC_V2
  )
}
|
|
@ -0,0 +1,52 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates
|
||||
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.timelines.prediction.features.engagement_features.EngagementDataRecordFeatures
|
||||
import com.twitter.timelines.prediction.features.itl.ITLFeatures
|
||||
|
||||
/**
 * Named sets of [[ITLFeatures]] features and binary labels used when defining aggregates.
 *
 * Each member is an immutable `Set`, so every constant needs to be listed only once.
 */
object RectweetUserFeatureAggregation {

  /** Binary engagement labels taken from [[ITLFeatures]]. */
  val RectweetLabelsForAggregation: Set[Feature.Binary] = Set(
    ITLFeatures.IS_FAVORITED,
    ITLFeatures.IS_RETWEETED,
    ITLFeatures.IS_REPLIED,
    ITLFeatures.IS_CLICKED,
    ITLFeatures.IS_PROFILE_CLICKED,
    ITLFeatures.IS_OPEN_LINKED,
    ITLFeatures.IS_PHOTO_EXPANDED,
    ITLFeatures.IS_VIDEO_PLAYBACK_50
  )

  /**
   * Tweet-level features taken from [[ITLFeatures]], plus the in-network favorites count.
   *
   * NOTE(review): `ITLFeatures.REPLY_COUNT` used to appear twice in this list. The repeated entry
   * had no effect on the resulting set and was removed; confirm that no other feature was meant
   * to be listed in its place.
   */
  val TweetFeatures: Set[Feature[_]] = Set(
    ITLFeatures.HAS_IMAGE,
    ITLFeatures.HAS_CARD,
    ITLFeatures.HAS_NEWS,
    ITLFeatures.REPLY_COUNT,
    ITLFeatures.FAV_COUNT,
    ITLFeatures.RETWEET_COUNT,
    ITLFeatures.MATCHES_UI_LANG,
    ITLFeatures.MATCHES_SEARCHER_MAIN_LANG,
    ITLFeatures.MATCHES_SEARCHER_LANGS,
    ITLFeatures.TEXT_SCORE,
    ITLFeatures.LINK_LANGUAGE,
    ITLFeatures.NUM_HASHTAGS,
    ITLFeatures.NUM_MENTIONS,
    ITLFeatures.IS_SENSITIVE,
    ITLFeatures.HAS_VIDEO,
    ITLFeatures.HAS_LINK,
    ITLFeatures.HAS_VISIBLE_LINK,
    EngagementDataRecordFeatures.InNetworkFavoritesCount
    // nice to have, but currently not hydrated in the RecommendedTweet payload
    //EngagementDataRecordFeatures.InNetworkRetweetsCount,
    //EngagementDataRecordFeatures.InNetworkRepliesCount
  )

  /** Labels for a reply that was impressed, replied to, favorited, retweeted or quoted by the author. */
  val ReciprocalLabels: Set[Feature.Binary] = Set(
    ITLFeatures.IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR,
    ITLFeatures.IS_REPLIED_REPLY_REPLIED_BY_AUTHOR,
    ITLFeatures.IS_REPLIED_REPLY_FAVORITED_BY_AUTHOR,
    ITLFeatures.IS_REPLIED_REPLY_RETWEETED_BY_AUTHOR,
    ITLFeatures.IS_REPLIED_REPLY_QUOTED_BY_AUTHOR
  )
}
|
|
@ -0,0 +1,80 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates
|
||||
|
||||
import com.twitter.dal.client.dataset.KeyValDALDataset
|
||||
import com.twitter.ml.api.DataRecord
|
||||
import com.twitter.ml.api.FeatureContext
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval
|
||||
import com.twitter.summingbird.batch.BatchID
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion.CombineCountsPolicy
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateStore
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.OfflineAggregateDataRecordStore
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Aggregation config whose output is rooted at `/user/timelines/processed/aggregates_v2`.
 *
 * Each offline aggregate store is bound to its own DAL dataset, looked up by store name in
 * [[storeToDatasetMap]].
 */
object TimelinesAggregationConfig extends TimelinesAggregationConfigTrait {
  override def outputHdfsPath: String = "/user/timelines/processed/aggregates_v2"

  /**
   * Maps an aggregate store name to the DAL dataset backing it.
   *
   * Only the store names listed here can be resolved by [[mkPhysicalStore]].
   */
  def storeToDatasetMap: Map[String, KeyValDALDataset[
    keyval.KeyVal[AggregationKey, (BatchID, DataRecord)]
  ]] = Map(
    AuthorTopicAggregateStore -> AuthorTopicAggregatesScalaDataset,
    UserTopicAggregateStore -> UserTopicAggregatesScalaDataset,
    UserInferredTopicAggregateStore -> UserInferredTopicAggregatesScalaDataset,
    UserAggregateStore -> UserAggregatesScalaDataset,
    UserAuthorAggregateStore -> UserAuthorAggregatesScalaDataset,
    UserOriginalAuthorAggregateStore -> UserOriginalAuthorAggregatesScalaDataset,
    OriginalAuthorAggregateStore -> OriginalAuthorAggregatesScalaDataset,
    UserEngagerAggregateStore -> UserEngagerAggregatesScalaDataset,
    UserMentionAggregateStore -> UserMentionAggregatesScalaDataset,
    TwitterWideUserAggregateStore -> TwitterWideUserAggregatesScalaDataset,
    TwitterWideUserAuthorAggregateStore -> TwitterWideUserAuthorAggregatesScalaDataset,
    UserRequestHourAggregateStore -> UserRequestHourAggregatesScalaDataset,
    UserRequestDowAggregateStore -> UserRequestDowAggregatesScalaDataset,
    UserListAggregateStore -> UserListAggregatesScalaDataset,
    UserMediaUnderstandingAnnotationAggregateStore -> UserMediaUnderstandingAnnotationAggregatesScalaDataset
  )

  /**
   * Binds an offline aggregate store to the DAL dataset registered under its name.
   *
   * @param store the logical store to convert
   * @return the store produced by `toOfflineAggregateDataRecordStoreWithDAL`
   * @throws NoSuchElementException if the store's name has no entry in [[storeToDatasetMap]]
   * @throws IllegalArgumentException if `store` is not an [[OfflineAggregateDataRecordStore]]
   */
  override def mkPhysicalStore(store: AggregateStore): AggregateStore = store match {
    case offlineStore: OfflineAggregateDataRecordStore =>
      // Same exception type as a bare Map lookup, but the message says which store is missing
      // and where it has to be registered.
      val dataset = storeToDatasetMap.getOrElse(
        offlineStore.name,
        throw new NoSuchElementException(
          s"No DAL dataset registered in storeToDatasetMap for aggregate store '${offlineStore.name}'")
      )
      offlineStore.toOfflineAggregateDataRecordStoreWithDAL(dataset)
    case _ => throw new IllegalArgumentException("Unsupported logical dataset type.")
  }

  /** [[CombineCountsPolicy]] instances, one per aggregate prefix. */
  object CombineCountPolicies {
    val EngagerCountsPolicy: CombineCountsPolicy = mkCountsPolicy("user_engager_aggregate")
    val EngagerGoodClickCountsPolicy: CombineCountsPolicy =
      mkCountsPolicy("user_engager_good_click_aggregate")
    val RectweetEngagerCountsPolicy: CombineCountsPolicy =
      mkCountsPolicy("rectweet_user_engager_aggregate")
    val MentionCountsPolicy: CombineCountsPolicy = mkCountsPolicy("user_mention_aggregate")
    val RectweetSimclustersTweetCountsPolicy: CombineCountsPolicy =
      mkCountsPolicy("rectweet_user_simcluster_tweet_aggregate")
    val UserInferredTopicCountsPolicy: CombineCountsPolicy =
      mkCountsPolicy("user_inferred_topic_aggregate")
    val UserInferredTopicV2CountsPolicy: CombineCountsPolicy =
      mkCountsPolicy("user_inferred_topic_aggregate_v2")
    val UserMediaUnderstandingAnnotationCountsPolicy: CombineCountsPolicy =
      mkCountsPolicy("user_media_annotation_aggregate")

    /**
     * Builds a policy (topK = 2, hard limit 20) over the output features of every aggregate
     * group in `aggregatesToCompute` whose `aggregatePrefix` equals `prefix`.
     */
    private[this] def mkCountsPolicy(prefix: String): CombineCountsPolicy = {
      val outputFeatures = TimelinesAggregationConfig.aggregatesToCompute
        .filter(_.aggregatePrefix == prefix)
        .flatMap(_.allOutputFeatures)
      CombineCountsPolicy(
        topK = 2,
        aggregateContextToPrecompute = new FeatureContext(outputFeatures.asJava),
        hardLimit = Some(20)
      )
    }
  }
}
|
||||
|
||||
/**
 * Aggregation config whose output is rooted at `/user/timelines/canaries/processed/aggregates_v2`.
 *
 * Every offline aggregate store is bound to the single `AggregatesCanaryScalaDataset`.
 */
object TimelinesAggregationCanaryConfig extends TimelinesAggregationConfigTrait {
  override def outputHdfsPath: String = "/user/timelines/canaries/processed/aggregates_v2"

  /**
   * Binds an offline aggregate store to the canary DAL dataset.
   *
   * @throws IllegalArgumentException if `store` is not an [[OfflineAggregateDataRecordStore]]
   */
  override def mkPhysicalStore(store: AggregateStore): AggregateStore =
    store match {
      case offlineStore: OfflineAggregateDataRecordStore =>
        offlineStore.toOfflineAggregateDataRecordStoreWithDAL(
          dalDataset = AggregatesCanaryScalaDataset
        )
      case _ =>
        throw new IllegalArgumentException("Unsupported logical dataset type.")
    }
}
|
|
@ -0,0 +1,579 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates
|
||||
|
||||
import com.twitter.conversions.DurationOps._
|
||||
import com.twitter.ml.api.constant.SharedFeatures.AUTHOR_ID
|
||||
import com.twitter.ml.api.constant.SharedFeatures.USER_ID
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework._
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics._
|
||||
import com.twitter.timelines.data_processing.ml_util.transforms.DownsampleTransform
|
||||
import com.twitter.timelines.data_processing.ml_util.transforms.RichRemoveAuthorIdZero
|
||||
import com.twitter.timelines.data_processing.ml_util.transforms.RichRemoveUserIdZero
|
||||
import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures
|
||||
import com.twitter.timelines.prediction.features.engagement_features.EngagementDataRecordFeatures
|
||||
import com.twitter.timelines.prediction.features.engagement_features.EngagementDataRecordFeatures.RichUnifyPublicEngagersTransform
|
||||
import com.twitter.timelines.prediction.features.list_features.ListFeatures
|
||||
import com.twitter.timelines.prediction.features.recap.RecapFeatures
|
||||
import com.twitter.timelines.prediction.features.request_context.RequestContextFeatures
|
||||
import com.twitter.timelines.prediction.features.semantic_core_features.SemanticCoreFeatures
|
||||
import com.twitter.timelines.prediction.transform.filter.FilterInNetworkTransform
|
||||
import com.twitter.timelines.prediction.transform.filter.FilterImageTweetTransform
|
||||
import com.twitter.timelines.prediction.transform.filter.FilterVideoTweetTransform
|
||||
import com.twitter.timelines.prediction.transform.filter.FilterOutImageVideoTweetTransform
|
||||
import com.twitter.util.Duration
|
||||
|
||||
trait TimelinesAggregationConfigDetails extends Serializable {
|
||||
|
||||
import TimelinesAggregationSources._
|
||||
|
||||
def outputHdfsPath: String
|
||||
|
||||
/**
|
||||
* Converts the given logical store to a physical store. The reason we do not specify the
|
||||
* physical store directly with the [[AggregateGroup]] is because of a cyclic dependency when
|
||||
* creating physical stores that are DalDataset with PersonalDataType annotations derived from
|
||||
* the [[AggregateGroup]].
|
||||
*
|
||||
*/
|
||||
def mkPhysicalStore(store: AggregateStore): AggregateStore
|
||||
|
||||
def defaultMaxKvSourceFailures: Int = 100
|
||||
|
||||
val timelinesOfflineAggregateSink = new OfflineStoreCommonConfig {
|
||||
override def apply(startDate: String) = OfflineAggregateStoreCommonConfig(
|
||||
outputHdfsPathPrefix = outputHdfsPath,
|
||||
dummyAppId = "timelines_aggregates_v2_ro",
|
||||
dummyDatasetPrefix = "timelines_aggregates_v2_ro",
|
||||
startDate = startDate
|
||||
)
|
||||
}
|
||||
|
||||
val UserAggregateStore = "user_aggregates"
|
||||
val UserAuthorAggregateStore = "user_author_aggregates"
|
||||
val UserOriginalAuthorAggregateStore = "user_original_author_aggregates"
|
||||
val OriginalAuthorAggregateStore = "original_author_aggregates"
|
||||
val UserEngagerAggregateStore = "user_engager_aggregates"
|
||||
val UserMentionAggregateStore = "user_mention_aggregates"
|
||||
val TwitterWideUserAggregateStore = "twitter_wide_user_aggregates"
|
||||
val TwitterWideUserAuthorAggregateStore = "twitter_wide_user_author_aggregates"
|
||||
val UserRequestHourAggregateStore = "user_request_hour_aggregates"
|
||||
val UserRequestDowAggregateStore = "user_request_dow_aggregates"
|
||||
val UserListAggregateStore = "user_list_aggregates"
|
||||
val AuthorTopicAggregateStore = "author_topic_aggregates"
|
||||
val UserTopicAggregateStore = "user_topic_aggregates"
|
||||
val UserInferredTopicAggregateStore = "user_inferred_topic_aggregates"
|
||||
val UserMediaUnderstandingAnnotationAggregateStore =
|
||||
"user_media_understanding_annotation_aggregates"
|
||||
val AuthorCountryCodeAggregateStore = "author_country_code_aggregates"
|
||||
val OriginalAuthorCountryCodeAggregateStore = "original_author_country_code_aggregates"
|
||||
|
||||
/**
|
||||
* Step 3: Configure all aggregates to compute.
|
||||
* Note that different subsets of aggregates in this list
|
||||
* can be launched by different summingbird job instances.
|
||||
* Any given job can be responsible for a set of AggregateGroup
|
||||
* configs whose outputStores share the same exact startDate.
|
||||
* AggregateGroups that do not share the same inputSource,
|
||||
* outputStore or startDate MUST be launched using different
|
||||
* summingbird jobs and passed in a different --start-time argument
|
||||
* See science/scalding/mesos/timelines/prod.yaml for an example
|
||||
* of how to configure your own job.
|
||||
*/
|
||||
val negativeDownsampleTransform =
|
||||
DownsampleTransform(
|
||||
negativeSamplingRate = 0.03,
|
||||
keepLabels = RecapUserFeatureAggregation.LabelsV2)
|
||||
val negativeRecTweetDownsampleTransform = DownsampleTransform(
|
||||
negativeSamplingRate = 0.03,
|
||||
keepLabels = RectweetUserFeatureAggregation.RectweetLabelsForAggregation
|
||||
)
|
||||
|
||||
val userAggregatesV2: AggregateGroup =
|
||||
AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_aggregate_v2",
|
||||
preTransforms = Seq(RichRemoveUserIdZero), /* Eliminates reducer skew */
|
||||
keys = Set(USER_ID),
|
||||
features = RecapUserFeatureAggregation.UserFeaturesV2,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
metrics = Set(CountMetric, SumMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserAggregateStore,
|
||||
startDate = "2016-07-15 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
))
|
||||
)
|
||||
|
||||
val userAuthorAggregatesV2: Set[AggregateGroup] = {
|
||||
|
||||
/**
|
||||
* NOTE: We need to remove records from out-of-network authors from the recap input
|
||||
* records (which now include out-of-network records as well after merging recap and
|
||||
* rectweet models) that are used to compute user-author aggregates. This is necessary
|
||||
* to limit the growth rate of user-author aggregates.
|
||||
*/
|
||||
val allFeatureAggregates = Set(
|
||||
AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_author_aggregate_v2",
|
||||
preTransforms = Seq(FilterInNetworkTransform, RichRemoveUserIdZero),
|
||||
keys = Set(USER_ID, AUTHOR_ID),
|
||||
features = RecapUserFeatureAggregation.UserAuthorFeaturesV2,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
metrics = Set(SumMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserAuthorAggregateStore,
|
||||
startDate = "2016-07-15 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
))
|
||||
)
|
||||
)
|
||||
|
||||
val countAggregates: Set[AggregateGroup] = Set(
|
||||
AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_author_aggregate_v2",
|
||||
preTransforms = Seq(FilterInNetworkTransform, RichRemoveUserIdZero),
|
||||
keys = Set(USER_ID, AUTHOR_ID),
|
||||
features = RecapUserFeatureAggregation.UserAuthorFeaturesV2Count,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserAuthorAggregateStore,
|
||||
startDate = "2016-07-15 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
))
|
||||
)
|
||||
)
|
||||
|
||||
allFeatureAggregates ++ countAggregates
|
||||
}
|
||||
|
||||
val userAggregatesV5Continuous: AggregateGroup =
|
||||
AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_aggregate_v5.continuous",
|
||||
preTransforms = Seq(RichRemoveUserIdZero),
|
||||
keys = Set(USER_ID),
|
||||
features = RecapUserFeatureAggregation.UserFeaturesV5Continuous,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
metrics = Set(CountMetric, SumMetric, SumSqMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserAggregateStore,
|
||||
startDate = "2016-07-15 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
))
|
||||
)
|
||||
|
||||
val userAuthorAggregatesV5: AggregateGroup =
|
||||
AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_author_aggregate_v5",
|
||||
preTransforms = Seq(FilterInNetworkTransform, RichRemoveUserIdZero),
|
||||
keys = Set(USER_ID, AUTHOR_ID),
|
||||
features = RecapUserFeatureAggregation.UserAuthorFeaturesV5,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserAuthorAggregateStore,
|
||||
startDate = "2016-07-15 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
))
|
||||
)
|
||||
|
||||
val tweetSourceUserAuthorAggregatesV1: AggregateGroup =
|
||||
AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_author_aggregate_tweetsource_v1",
|
||||
preTransforms = Seq(FilterInNetworkTransform, RichRemoveUserIdZero),
|
||||
keys = Set(USER_ID, AUTHOR_ID),
|
||||
features = RecapUserFeatureAggregation.UserAuthorTweetSourceFeaturesV1,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
metrics = Set(CountMetric, SumMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserAuthorAggregateStore,
|
||||
startDate = "2016-07-15 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
))
|
||||
)
|
||||
|
||||
val userEngagerAggregates = AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_engager_aggregate",
|
||||
keys = Set(USER_ID, EngagementDataRecordFeatures.PublicEngagementUserIds),
|
||||
features = Set.empty,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserEngagerAggregateStore,
|
||||
startDate = "2016-09-02 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
)),
|
||||
preTransforms = Seq(
|
||||
RichRemoveUserIdZero,
|
||||
RichUnifyPublicEngagersTransform
|
||||
)
|
||||
)
|
||||
|
||||
val userMentionAggregates = AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
preTransforms = Seq(RichRemoveUserIdZero), /* Eliminates reducer skew */
|
||||
aggregatePrefix = "user_mention_aggregate",
|
||||
keys = Set(USER_ID, RecapFeatures.MENTIONED_SCREEN_NAMES),
|
||||
features = Set.empty,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserMentionAggregateStore,
|
||||
startDate = "2017-03-01 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
)),
|
||||
includeAnyLabel = false
|
||||
)
|
||||
|
||||
val twitterWideUserAggregates = AggregateGroup(
|
||||
inputSource = timelinesDailyTwitterWideSource,
|
||||
preTransforms = Seq(RichRemoveUserIdZero), /* Eliminates reducer skew */
|
||||
aggregatePrefix = "twitter_wide_user_aggregate",
|
||||
keys = Set(USER_ID),
|
||||
features = RecapUserFeatureAggregation.TwitterWideFeatures,
|
||||
labels = RecapUserFeatureAggregation.TwitterWideLabels,
|
||||
metrics = Set(CountMetric, SumMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = TwitterWideUserAggregateStore,
|
||||
startDate = "2016-12-28 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
))
|
||||
)
|
||||
|
||||
val twitterWideUserAuthorAggregates = AggregateGroup(
|
||||
inputSource = timelinesDailyTwitterWideSource,
|
||||
preTransforms = Seq(RichRemoveUserIdZero), /* Eliminates reducer skew */
|
||||
aggregatePrefix = "twitter_wide_user_author_aggregate",
|
||||
keys = Set(USER_ID, AUTHOR_ID),
|
||||
features = RecapUserFeatureAggregation.TwitterWideFeatures,
|
||||
labels = RecapUserFeatureAggregation.TwitterWideLabels,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = TwitterWideUserAuthorAggregateStore,
|
||||
startDate = "2016-12-28 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
)),
|
||||
includeAnyLabel = false
|
||||
)
|
||||
|
||||
/**
|
||||
* User-HourOfDay and User-DayOfWeek aggregations, both for recap and rectweet
|
||||
*/
|
||||
val userRequestHourAggregates = AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_request_context_aggregate.hour",
|
||||
preTransforms = Seq(RichRemoveUserIdZero, negativeDownsampleTransform),
|
||||
keys = Set(USER_ID, RequestContextFeatures.TIMESTAMP_GMT_HOUR),
|
||||
features = Set.empty,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserRequestHourAggregateStore,
|
||||
startDate = "2017-08-01 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
))
|
||||
)
|
||||
|
||||
val userRequestDowAggregates = AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_request_context_aggregate.dow",
|
||||
preTransforms = Seq(RichRemoveUserIdZero, negativeDownsampleTransform),
|
||||
keys = Set(USER_ID, RequestContextFeatures.TIMESTAMP_GMT_DOW),
|
||||
features = Set.empty,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserRequestDowAggregateStore,
|
||||
startDate = "2017-08-01 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
))
|
||||
)
|
||||
|
||||
val authorTopicAggregates = AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "author_topic_aggregate",
|
||||
preTransforms = Seq(RichRemoveUserIdZero),
|
||||
keys = Set(AUTHOR_ID, TimelinesSharedFeatures.TOPIC_ID),
|
||||
features = Set.empty,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = AuthorTopicAggregateStore,
|
||||
startDate = "2020-05-19 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
))
|
||||
)
|
||||
|
||||
val userTopicAggregates = AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_topic_aggregate",
|
||||
preTransforms = Seq(RichRemoveUserIdZero),
|
||||
keys = Set(USER_ID, TimelinesSharedFeatures.TOPIC_ID),
|
||||
features = Set.empty,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserTopicAggregateStore,
|
||||
startDate = "2020-05-23 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
))
|
||||
)
|
||||
|
||||
val userTopicAggregatesV2 = AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_topic_aggregate_v2",
|
||||
preTransforms = Seq(RichRemoveUserIdZero),
|
||||
keys = Set(USER_ID, TimelinesSharedFeatures.TOPIC_ID),
|
||||
features = RecapUserFeatureAggregation.UserTopicFeaturesV2Count,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
includeAnyFeature = false,
|
||||
includeAnyLabel = false,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserTopicAggregateStore,
|
||||
startDate = "2020-05-23 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
))
|
||||
)
|
||||
|
||||
val userInferredTopicAggregates = AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_inferred_topic_aggregate",
|
||||
preTransforms = Seq(RichRemoveUserIdZero),
|
||||
keys = Set(USER_ID, TimelinesSharedFeatures.INFERRED_TOPIC_IDS),
|
||||
features = Set.empty,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserInferredTopicAggregateStore,
|
||||
startDate = "2020-09-09 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
))
|
||||
)
|
||||
|
||||
val userInferredTopicAggregatesV2 = AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_inferred_topic_aggregate_v2",
|
||||
preTransforms = Seq(RichRemoveUserIdZero),
|
||||
keys = Set(USER_ID, TimelinesSharedFeatures.INFERRED_TOPIC_IDS),
|
||||
features = RecapUserFeatureAggregation.UserTopicFeaturesV2Count,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
includeAnyFeature = false,
|
||||
includeAnyLabel = false,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserInferredTopicAggregateStore,
|
||||
startDate = "2020-09-09 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
))
|
||||
)
|
||||
|
||||
val userReciprocalEngagementAggregates = AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_aggregate_v6",
|
||||
preTransforms = Seq(RichRemoveUserIdZero),
|
||||
keys = Set(USER_ID),
|
||||
features = Set.empty,
|
||||
labels = RecapUserFeatureAggregation.ReciprocalLabels,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserAggregateStore,
|
||||
startDate = "2016-07-15 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
)),
|
||||
includeAnyLabel = false
|
||||
)
|
||||
|
||||
val userOriginalAuthorReciprocalEngagementAggregates = AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_original_author_aggregate_v1",
|
||||
preTransforms = Seq(RichRemoveUserIdZero, RichRemoveAuthorIdZero),
|
||||
keys = Set(USER_ID, TimelinesSharedFeatures.ORIGINAL_AUTHOR_ID),
|
||||
features = Set.empty,
|
||||
labels = RecapUserFeatureAggregation.ReciprocalLabels,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserOriginalAuthorAggregateStore,
|
||||
startDate = "2018-12-26 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
)),
|
||||
includeAnyLabel = false
|
||||
)
|
||||
|
||||
val originalAuthorReciprocalEngagementAggregates = AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "original_author_aggregate_v1",
|
||||
preTransforms = Seq(RichRemoveUserIdZero, RichRemoveAuthorIdZero),
|
||||
keys = Set(TimelinesSharedFeatures.ORIGINAL_AUTHOR_ID),
|
||||
features = Set.empty,
|
||||
labels = RecapUserFeatureAggregation.ReciprocalLabels,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = OriginalAuthorAggregateStore,
|
||||
startDate = "2023-02-25 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
)),
|
||||
includeAnyLabel = false
|
||||
)
|
||||
|
||||
val originalAuthorNegativeEngagementAggregates = AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "original_author_aggregate_v2",
|
||||
preTransforms = Seq(RichRemoveUserIdZero, RichRemoveAuthorIdZero),
|
||||
keys = Set(TimelinesSharedFeatures.ORIGINAL_AUTHOR_ID),
|
||||
features = Set.empty,
|
||||
labels = RecapUserFeatureAggregation.NegativeEngagementLabels,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = OriginalAuthorAggregateStore,
|
||||
startDate = "2023-02-25 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
)),
|
||||
includeAnyLabel = false
|
||||
)
|
||||
|
||||
val userListAggregates: AggregateGroup =
|
||||
AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_list_aggregate",
|
||||
keys = Set(USER_ID, ListFeatures.LIST_ID),
|
||||
features = Set.empty,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserListAggregateStore,
|
||||
startDate = "2020-05-28 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
)),
|
||||
preTransforms = Seq(RichRemoveUserIdZero)
|
||||
)
|
||||
|
||||
val userMediaUnderstandingAnnotationAggregates: AggregateGroup = AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_media_annotation_aggregate",
|
||||
preTransforms = Seq(RichRemoveUserIdZero),
|
||||
keys =
|
||||
Set(USER_ID, SemanticCoreFeatures.mediaUnderstandingHighRecallNonSensitiveEntityIdsFeature),
|
||||
features = Set.empty,
|
||||
labels = RecapUserFeatureAggregation.LabelsV2,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserMediaUnderstandingAnnotationAggregateStore,
|
||||
startDate = "2021-03-20 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink
|
||||
))
|
||||
)
|
||||
|
||||
val userAuthorGoodClickAggregates = AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_author_good_click_aggregate",
|
||||
preTransforms = Seq(FilterInNetworkTransform, RichRemoveUserIdZero),
|
||||
keys = Set(USER_ID, AUTHOR_ID),
|
||||
features = RecapUserFeatureAggregation.UserAuthorFeaturesV2,
|
||||
labels = RecapUserFeatureAggregation.GoodClickLabels,
|
||||
metrics = Set(SumMetric),
|
||||
halfLives = Set(14.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserAuthorAggregateStore,
|
||||
startDate = "2016-07-15 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
))
|
||||
)
|
||||
|
||||
val userEngagerGoodClickAggregates = AggregateGroup(
|
||||
inputSource = timelinesDailyRecapMinimalSource,
|
||||
aggregatePrefix = "user_engager_good_click_aggregate",
|
||||
keys = Set(USER_ID, EngagementDataRecordFeatures.PublicEngagementUserIds),
|
||||
features = Set.empty,
|
||||
labels = RecapUserFeatureAggregation.GoodClickLabels,
|
||||
metrics = Set(CountMetric),
|
||||
halfLives = Set(14.days),
|
||||
outputStore = mkPhysicalStore(
|
||||
OfflineAggregateDataRecordStore(
|
||||
name = UserEngagerAggregateStore,
|
||||
startDate = "2016-09-02 00:00",
|
||||
commonConfig = timelinesOfflineAggregateSink,
|
||||
maxKvSourceFailures = defaultMaxKvSourceFailures
|
||||
)),
|
||||
preTransforms = Seq(
|
||||
RichRemoveUserIdZero,
|
||||
RichUnifyPublicEngagersTransform
|
||||
)
|
||||
)
|
||||
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates
|
||||
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationConfig
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateGroup
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
|
||||
|
||||
/**
 * Collects the aggregate groups declared in [[TimelinesAggregationConfigDetails]] that this
 * config computes and exposes them in the typed form required by [[AggregationConfig]].
 */
trait TimelinesAggregationConfigTrait
    extends TimelinesAggregationConfigDetails
    with AggregationConfig {

  // The single-group vals listed here plus both members of userAuthorAggregatesV2.
  private val aggregateGroups: Set[AggregateGroup] = Set(
    authorTopicAggregates,
    userTopicAggregates,
    userTopicAggregatesV2,
    userInferredTopicAggregates,
    userInferredTopicAggregatesV2,
    userAggregatesV2,
    userAggregatesV5Continuous,
    userReciprocalEngagementAggregates,
    userAuthorAggregatesV5,
    userOriginalAuthorReciprocalEngagementAggregates,
    originalAuthorReciprocalEngagementAggregates,
    tweetSourceUserAuthorAggregatesV1,
    userEngagerAggregates,
    userMentionAggregates,
    twitterWideUserAggregates,
    twitterWideUserAuthorAggregates,
    userRequestHourAggregates,
    userRequestDowAggregates,
    userListAggregates,
    userMediaUnderstandingAnnotationAggregates,
  ) ++ userAuthorAggregatesV2

  /** One list of typed groups per configured [[AggregateGroup]]. */
  val aggregatesToComputeList: Set[List[TypedAggregateGroup[_]]] =
    aggregateGroups.map(group => group.buildTypedAggregateGroups())

  /** All typed groups from [[aggregatesToComputeList]], flattened into one set. */
  override val aggregatesToCompute: Set[TypedAggregateGroup[_]] =
    aggregatesToComputeList.flatten

  /*
   * Feature selection config to save storage space and manhattan query bandwidth.
   * Only the most important features found using offline RCE simulations are used
   * when actually training and serving. This selector is used by
   * [[com.twitter.timelines.data_processing.jobs.timeline_ranking_user_features.TimelineRankingAggregatesV2FeaturesProdJob]]
   * but defined here to keep it in sync with the config that computes the aggregates.
   */
  val AggregatesV2FeatureSelector = FeatureSelectorConfig.AggregatesV2ProdFeatureSelector

  /**
   * Returns the configured groups whose output store name is contained in `storeNames`.
   *
   * @param storeNames names of the output stores to keep
   */
  def filterAggregatesGroups(storeNames: Set[String]): Set[AggregateGroup] =
    aggregateGroups.filter(group => storeNames(group.outputStore.name))
}
|
|
@ -0,0 +1,48 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates
|
||||
|
||||
import com.twitter.ml.api.DataRecord
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
|
||||
import com.twitter.summingbird.batch.BatchID
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.{
|
||||
AggregateStore,
|
||||
AggregationKey,
|
||||
OfflineAggregateInjections,
|
||||
TypedAggregateGroup
|
||||
}
|
||||
|
||||
/**
 * Key-value injections for each offline aggregate store, built from the typed aggregate groups
 * that write to that store.
 */
object TimelinesAggregationKeyValInjections extends TimelinesAggregationConfigTrait {

  import OfflineAggregateInjections.getInjection

  type KVInjection = KeyValInjection[AggregationKey, (BatchID, DataRecord)]

  val AuthorTopic: KVInjection = getInjection(filter(AuthorTopicAggregateStore))
  val UserTopic: KVInjection = getInjection(filter(UserTopicAggregateStore))
  val UserInferredTopic: KVInjection = getInjection(filter(UserInferredTopicAggregateStore))
  val User: KVInjection = getInjection(filter(UserAggregateStore))
  val UserAuthor: KVInjection = getInjection(filter(UserAuthorAggregateStore))
  val UserOriginalAuthor: KVInjection = getInjection(filter(UserOriginalAuthorAggregateStore))
  val OriginalAuthor: KVInjection = getInjection(filter(OriginalAuthorAggregateStore))
  val UserEngager: KVInjection = getInjection(filter(UserEngagerAggregateStore))
  val UserMention: KVInjection = getInjection(filter(UserMentionAggregateStore))
  val TwitterWideUser: KVInjection = getInjection(filter(TwitterWideUserAggregateStore))
  val TwitterWideUserAuthor: KVInjection = getInjection(filter(TwitterWideUserAuthorAggregateStore))
  val UserRequestHour: KVInjection = getInjection(filter(UserRequestHourAggregateStore))
  val UserRequestDow: KVInjection = getInjection(filter(UserRequestDowAggregateStore))
  val UserList: KVInjection = getInjection(filter(UserListAggregateStore))
  val UserMediaUnderstandingAnnotation: KVInjection = getInjection(
    filter(UserMediaUnderstandingAnnotationAggregateStore))

  /**
   * Selects the typed aggregate groups whose output store is named `storeName`.
   *
   * @throws IllegalArgumentException if no configured group writes to `storeName`. This runs
   *         while the object is being initialized, so the message names the offending store.
   */
  private def filter(storeName: String): Set[TypedAggregateGroup[_]] = {
    val groups = aggregatesToCompute.filter(_.outputStore.name == storeName)
    require(groups.nonEmpty, s"No aggregate groups are configured for store '$storeName'")
    groups
  }

  override def outputHdfsPath: String = "/user/timelines/processed/aggregates_v2"

  // Since this object is not used to execute any online or offline aggregates job, but is meant
  // to store all PDT enabled KeyValInjections, we do not need to construct a physical store.
  // We use the identity operation as a default.
  override def mkPhysicalStore(store: AggregateStore): AggregateStore = store
}
|
|
@ -0,0 +1,45 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates
|
||||
|
||||
import com.twitter.ml.api.constant.SharedFeatures.TIMESTAMP
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.OfflineAggregateSource
|
||||
import com.twitter.timelines.prediction.features.p_home_latest.HomeLatestUserAggregatesFeatures
|
||||
import timelines.data_processing.ad_hoc.recap.data_record_preparation.RecapDataRecordsAggMinimalJavaDataset
|
||||
|
||||
/**
 * Offline input sources for the timelines aggregation configs.
 *
 * Any update here should be in sync with [[TimelinesFeatureGroups]] and
 * [[AggMinimalDataRecordGeneratorJob]].
 */
object TimelinesAggregationSources {

  /**
   * This is the recap data records after post-processing in
   * [[GenerateRecapAggMinimalDataRecordsJob]]. Configured with a DAL dataset rather than an
   * HDFS path.
   */
  val timelinesDailyRecapMinimalSource: OfflineAggregateSource =
    OfflineAggregateSource(
      name = "timelines_daily_recap",
      timestampFeature = TIMESTAMP,
      dalDataSet = Some(RecapDataRecordsAggMinimalJavaDataset),
      scaldingSuffixType = Some("dal"),
      withValidation = true
    )

  /** Twitter-wide data records, configured with an HDFS path and the "daily" suffix type. */
  val timelinesDailyTwitterWideSource: OfflineAggregateSource =
    OfflineAggregateSource(
      name = "timelines_daily_twitter_wide",
      timestampFeature = TIMESTAMP,
      scaldingHdfsPath = Some("/user/timelines/processed/suggests/recap/twitter_wide_data_records"),
      scaldingSuffixType = Some("daily"),
      withValidation = true
    )

  /**
   * List timeline data records, configured with an HDFS path.
   * NOTE(review): the name says "daily" but the suffix type is "hourly" — confirm intended.
   */
  val timelinesDailyListTimelineSource: OfflineAggregateSource =
    OfflineAggregateSource(
      name = "timelines_daily_list_timeline",
      timestampFeature = TIMESTAMP,
      scaldingHdfsPath = Some("/user/timelines/processed/suggests/recap/all_features/list"),
      scaldingSuffixType = Some("hourly"),
      withValidation = true
    )

  /**
   * Home Latest user aggregates. Uses its own timestamp feature and, unlike the other sources
   * in this object, does not pass `withValidation`.
   */
  val timelinesDailyHomeLatestSource: OfflineAggregateSource =
    OfflineAggregateSource(
      name = "timelines_daily_home_latest",
      timestampFeature = HomeLatestUserAggregatesFeatures.AGGREGATE_TIMESTAMP_MS,
      scaldingHdfsPath = Some("/user/timelines/processed/p_home_latest/user_aggregates"),
      scaldingSuffixType = Some("daily")
    )
}
|
|
@ -0,0 +1,70 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates.real_time
|
||||
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType.UserState
|
||||
import com.twitter.ml.api.Feature.Binary
|
||||
import com.twitter.ml.api.{DataRecord, Feature, FeatureContext, RichDataRecord}
|
||||
import com.twitter.ml.featurestore.catalog.entities.core.Author
|
||||
import com.twitter.ml.featurestore.catalog.features.magicrecs.UserActivity
|
||||
import com.twitter.ml.featurestore.lib.data.PredictionRecord
|
||||
import com.twitter.ml.featurestore.lib.feature.{BoundFeature, BoundFeatureSet}
|
||||
import com.twitter.ml.featurestore.lib.{UserId, Discrete => FSDiscrete}
|
||||
import com.twitter.timelines.prediction.common.adapters.TimelinesAdapterBase
|
||||
import java.lang.{Boolean => JBoolean}
|
||||
import java.util
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Adapts a [[PredictionRecord]] holding the author-bound user-state feature into a
 * [[DataRecord]] with one binary feature set for the matching state.
 */
object AuthorFeaturesAdapter extends TimelinesAdapterBase[PredictionRecord] {
  val UserStateBoundFeature: BoundFeature[UserId, FSDiscrete] = UserActivity.UserState.bind(Author)
  val UserFeaturesSet: BoundFeatureSet = BoundFeatureSet(UserStateBoundFeature)

  // Builds a binary feature annotated with the UserState personal data type.
  private def userStateFlag(featureName: String): Binary =
    new Binary(featureName, Set(UserState).asJava)

  /**
   * Boolean features about the author's user state (the feature above is bound to Author).
   * enum UserState {
   *   NEW = 0,
   *   NEAR_ZERO = 1,
   *   VERY_LIGHT = 2,
   *   LIGHT = 3,
   *   MEDIUM_TWEETER = 4,
   *   MEDIUM_NON_TWEETER = 5,
   *   HEAVY_NON_TWEETER = 6,
   *   HEAVY_TWEETER = 7
   * }(persisted='true')
   */
  val IS_USER_NEW = userStateFlag("timelines.author.user_state.is_user_new")
  val IS_USER_LIGHT = userStateFlag("timelines.author.user_state.is_user_light")
  val IS_USER_MEDIUM_TWEETER =
    userStateFlag("timelines.author.user_state.is_user_medium_tweeter")
  val IS_USER_MEDIUM_NON_TWEETER =
    userStateFlag("timelines.author.user_state.is_user_medium_non_tweeter")
  val IS_USER_HEAVY_NON_TWEETER =
    userStateFlag("timelines.author.user_state.is_user_heavy_non_tweeter")
  val IS_USER_HEAVY_TWEETER =
    userStateFlag("timelines.author.user_state.is_user_heavy_tweeter")

  /** Enum value to binary feature; values 1, 2 and 3 all map to IS_USER_LIGHT. */
  val userStateToFeatureMap: Map[Long, Binary] = Map(
    0L -> IS_USER_NEW,
    1L -> IS_USER_LIGHT,
    2L -> IS_USER_LIGHT,
    3L -> IS_USER_LIGHT,
    4L -> IS_USER_MEDIUM_TWEETER,
    5L -> IS_USER_MEDIUM_NON_TWEETER,
    6L -> IS_USER_HEAVY_NON_TWEETER,
    7L -> IS_USER_HEAVY_TWEETER
  )

  val UserStateBooleanFeatures: Set[Feature[_]] = userStateToFeatureMap.values.toSet

  private val allFeatures: Seq[Feature[_]] = UserStateBooleanFeatures.toSeq

  override def getFeatureContext: FeatureContext = new FeatureContext(allFeatures: _*)

  override def commonFeatures: Set[Feature[_]] = Set.empty

  /**
   * Returns a single-element list holding a record in which the binary feature for the
   * author's user state is set to true. The record is left empty when the bound feature has
   * no value or the value has no entry in [[userStateToFeatureMap]].
   */
  override def adaptToDataRecords(record: PredictionRecord): util.List[DataRecord] = {
    val output = new RichDataRecord(new DataRecord)

    for {
      state <- record.getFeatureValue(UserStateBoundFeature)
      flag <- userStateToFeatureMap.get(state.value)
    } output.setFeatureValue[JBoolean](flag, true)

    List(output.getRecord).asJava
  }
}
|
|
@ -0,0 +1,199 @@
|
|||
# Heron binary whose entry point is TypeSafeRunner; wrapped by the :rta_heron app target.
heron_binary(
    name = "heron-without-jass",
    main = "com.twitter.timelines.prediction.common.aggregates.real_time.TypeSafeRunner",
    oss = True,
    platform = "java8",
    runtime_platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        ":real_time",
        "3rdparty/jvm/org/slf4j:slf4j-jdk14",
    ],
)
|
||||
|
||||
# Application bundle: the :heron-without-jass binary plus resources/jaas.conf.
jvm_app(
    name = "rta_heron",
    binary = ":heron-without-jass",
    bundles = [
        bundle(
            fileset = ["resources/jaas.conf"],
        ),
    ],
    tags = [
        "bazel-compatible",
        "bazel-only",
    ],
)
|
||||
|
||||
# Unnamed library target covering every *.scala file in this directory.
scala_library(
    sources = ["*.scala"],
    platform = "java8",
    strict_deps = False,
    tags = ["bazel-compatible"],
    dependencies = [
        ":online-configs",
        "3rdparty/src/jvm/com/twitter/summingbird:storm",
        "src/java/com/twitter/heron/util",
        "src/java/com/twitter/ml/api:api-base",
        "src/java/com/twitter/ml/api/constant",
        "src/scala/com/twitter/frigate/data_pipeline/features_aggregated/core:core-features",
        "src/scala/com/twitter/ml/api/util",
        "src/scala/com/twitter/storehaus_internal/memcache",
        "src/scala/com/twitter/storehaus_internal/util",
        "src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits",
        "src/scala/com/twitter/summingbird_internal/runner/store_config",
        "src/scala/com/twitter/summingbird_internal/runner/storm",
        "src/scala/com/twitter/summingbird_internal/sources/storm/remote:ClientEventSourceScrooge2",
        "src/scala/com/twitter/timelines/prediction/adapters/client_log_event",
        "src/scala/com/twitter/timelines/prediction/adapters/client_log_event_mr",
        "src/scala/com/twitter/timelines/prediction/features/client_log_event",
        "src/scala/com/twitter/timelines/prediction/features/common",
        "src/scala/com/twitter/timelines/prediction/features/list_features",
        "src/scala/com/twitter/timelines/prediction/features/recap",
        "src/scala/com/twitter/timelines/prediction/features/user_health",
        "src/thrift/com/twitter/ml/api:data-java",
        "src/thrift/com/twitter/timelines/suggests/common:record-scala",
        "timelinemixer/common/src/main/scala/com/twitter/timelinemixer/clients/served_features_cache",
        "timelines/data_processing/ml_util/aggregation_framework:common_types",
        "timelines/data_processing/ml_util/aggregation_framework/heron",
        "timelines/data_processing/ml_util/aggregation_framework/job",
        "timelines/data_processing/ml_util/aggregation_framework/metrics",
        "timelines/data_processing/ml_util/transforms",
        "timelines/src/main/scala/com/twitter/timelines/clients/memcache_common",
        "util/util-core:scala",
    ],
)
|
||||
|
||||
# Library built from an explicit list of source files; depends on :base-config.
scala_library(
    name = "online-configs",
    sources = [
        "AuthorFeaturesAdapter.scala",
        "Event.scala",
        "FeatureStoreUtils.scala",
        "StormAggregateSourceUtils.scala",
        "TimelinesOnlineAggregationConfig.scala",
        "TimelinesOnlineAggregationConfigBase.scala",
        "TimelinesOnlineAggregationSources.scala",
        "TimelinesStormAggregateSource.scala",
        "TweetFeaturesReadableStore.scala",
        "UserFeaturesAdapter.scala",
        "UserFeaturesReadableStore.scala",
    ],
    platform = "java8",
    strict_deps = True,
    tags = ["bazel-compatible"],
    dependencies = [
        ":base-config",
        "3rdparty/src/jvm/com/twitter/scalding:db",
        "3rdparty/src/jvm/com/twitter/storehaus:core",
        "3rdparty/src/jvm/com/twitter/summingbird:core",
        "3rdparty/src/jvm/com/twitter/summingbird:online",
        "3rdparty/src/jvm/com/twitter/summingbird:storm",
        "abuse/detection/src/main/thrift/com/twitter/abuse/detection/mention_interactions:thrift-scala",
        "snowflake/src/main/scala/com/twitter/snowflake/id",
        "snowflake/src/main/thrift:thrift-scala",
        "src/java/com/twitter/ml/api:api-base",
        "src/java/com/twitter/ml/api/constant",
        "src/scala/com/twitter/frigate/data_pipeline/features_aggregated/core:core-features",
        "src/scala/com/twitter/ml/api/util:datarecord",
        "src/scala/com/twitter/ml/featurestore/catalog/datasets/geo:geo-user-location",
        "src/scala/com/twitter/ml/featurestore/catalog/datasets/magicrecs:user-features",
        "src/scala/com/twitter/ml/featurestore/catalog/entities/core",
        "src/scala/com/twitter/ml/featurestore/catalog/features/core:user",
        "src/scala/com/twitter/ml/featurestore/catalog/features/geo",
        "src/scala/com/twitter/ml/featurestore/catalog/features/magicrecs:user-activity",
        "src/scala/com/twitter/ml/featurestore/catalog/features/magicrecs:user-info",
        "src/scala/com/twitter/ml/featurestore/catalog/features/trends:tweet_trends_scores",
        "src/scala/com/twitter/ml/featurestore/lib/data",
        "src/scala/com/twitter/ml/featurestore/lib/dataset/offline",
        "src/scala/com/twitter/ml/featurestore/lib/export/strato:app-names",
        "src/scala/com/twitter/ml/featurestore/lib/feature",
        "src/scala/com/twitter/ml/featurestore/lib/online",
        "src/scala/com/twitter/ml/featurestore/lib/params",
        "src/scala/com/twitter/storehaus_internal/util",
        "src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits",
        "src/scala/com/twitter/summingbird_internal/runner/store_config",
        "src/scala/com/twitter/summingbird_internal/runner/storm",
        "src/scala/com/twitter/summingbird_internal/sources/common",
        "src/scala/com/twitter/summingbird_internal/sources/common/remote:ClientEventSourceScrooge",
        "src/scala/com/twitter/summingbird_internal/sources/storm/remote:ClientEventSourceScrooge2",
        "src/scala/com/twitter/timelines/prediction/adapters/client_log_event",
        "src/scala/com/twitter/timelines/prediction/adapters/client_log_event_mr",
        "src/scala/com/twitter/timelines/prediction/common/adapters:base",
        "src/scala/com/twitter/timelines/prediction/common/adapters:engagement-converter",
        "src/scala/com/twitter/timelines/prediction/common/aggregates",
        "src/scala/com/twitter/timelines/prediction/features/client_log_event",
        "src/scala/com/twitter/timelines/prediction/features/common",
        "src/scala/com/twitter/timelines/prediction/features/list_features",
        "src/scala/com/twitter/timelines/prediction/features/recap",
        "src/scala/com/twitter/timelines/prediction/features/user_health",
        "src/thrift/com/twitter/clientapp/gen:clientapp-scala",
        "src/thrift/com/twitter/dal/personal_data:personal_data-java",
        "src/thrift/com/twitter/ml/api:data-java",
        "src/thrift/com/twitter/timelines/suggests/common:engagement-java",
        "src/thrift/com/twitter/timelines/suggests/common:engagement-scala",
        "src/thrift/com/twitter/timelines/suggests/common:record-scala",
        "src/thrift/com/twitter/timelineservice/injection:thrift-scala",
        "src/thrift/com/twitter/timelineservice/server/suggests/logging:thrift-scala",
        "strato/src/main/scala/com/twitter/strato/client",
        "timelinemixer/common/src/main/scala/com/twitter/timelinemixer/clients/served_features_cache",
        "timelines/data_processing/ad_hoc/suggests/common:raw_training_data_creator",
        "timelines/data_processing/ml_util/aggregation_framework:common_types",
        "timelines/data_processing/ml_util/aggregation_framework/heron:configs",
        "timelines/data_processing/ml_util/aggregation_framework/metrics",
        "timelines/data_processing/ml_util/transforms",
        "timelines/data_processing/util:rich-request",
        "tweetsource/common/src/main/thrift:thrift-scala",
        "twitter-server-internal/src/main/scala",
        "unified_user_actions/client/src/main/scala/com/twitter/unified_user_actions/client/config",
        "unified_user_actions/client/src/main/scala/com/twitter/unified_user_actions/client/summingbird",
        "unified_user_actions/thrift/src/main/thrift/com/twitter/unified_user_actions:unified_user_actions-scala",
        "util/util-core:scala",
        "util/util-stats/src/main/scala/com/twitter/finagle/stats",
    ],
)
|
||||
|
||||
# Library holding the feature adapters and the aggregation config base.
scala_library(
    name = "base-config",
    sources = [
        "AuthorFeaturesAdapter.scala",
        "TimelinesOnlineAggregationConfigBase.scala",
        "TweetFeaturesAdapter.scala",
        "UserFeaturesAdapter.scala",
    ],
    platform = "java8",
    strict_deps = True,
    tags = ["bazel-compatible"],
    dependencies = [
        "src/java/com/twitter/ml/api:api-base",
        "src/java/com/twitter/ml/api/constant",
        "src/resources/com/twitter/timelines/prediction/common/aggregates/real_time",
        "src/scala/com/twitter/ml/api/util:datarecord",
        "src/scala/com/twitter/ml/featurestore/catalog/datasets/magicrecs:user-features",
        "src/scala/com/twitter/ml/featurestore/catalog/entities/core",
        "src/scala/com/twitter/ml/featurestore/catalog/features/core:user",
        "src/scala/com/twitter/ml/featurestore/catalog/features/geo",
        "src/scala/com/twitter/ml/featurestore/catalog/features/magicrecs:user-activity",
        "src/scala/com/twitter/ml/featurestore/catalog/features/magicrecs:user-info",
        "src/scala/com/twitter/ml/featurestore/catalog/features/trends:tweet_trends_scores",
        "src/scala/com/twitter/ml/featurestore/lib/data",
        "src/scala/com/twitter/ml/featurestore/lib/feature",
        "src/scala/com/twitter/timelines/prediction/common/adapters:base",
        "src/scala/com/twitter/timelines/prediction/common/adapters:engagement-converter",
        "src/scala/com/twitter/timelines/prediction/common/aggregates",
        "src/scala/com/twitter/timelines/prediction/features/client_log_event",
        "src/scala/com/twitter/timelines/prediction/features/common",
        "src/scala/com/twitter/timelines/prediction/features/list_features",
        "src/scala/com/twitter/timelines/prediction/features/recap",
        "src/scala/com/twitter/timelines/prediction/features/user_health",
        "src/thrift/com/twitter/dal/personal_data:personal_data-java",
        "src/thrift/com/twitter/ml/api:feature_context-java",
        "src/thrift/com/twitter/timelines/suggests/common:engagement-scala",
        "timelines/data_processing/ml_util/aggregation_framework:common_types",
        "timelines/data_processing/ml_util/aggregation_framework/heron:base-config",
        "timelines/data_processing/ml_util/aggregation_framework/metrics",
        "timelines/data_processing/ml_util/transforms",
        "util/util-core:scala",
        "util/util-core:util-core-util",
    ],
)
|
|
@ -0,0 +1,11 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates.real_time
|
||||
|
||||
/**
 * Tagged wrapper around a payload of type `T`. The concrete subtype records
 * which kind of event the payload came from, so downstream code can pattern
 * match on it.
 */
private[real_time] sealed trait Event[T] {
  def event: T
}

/** Payload tagged as a home event. */
private[real_time] case class HomeEvent[T](event: T) extends Event[T]

/** Payload tagged as a profile event. */
private[real_time] case class ProfileEvent[T](event: T) extends Event[T]

/** Payload tagged as a search event. */
private[real_time] case class SearchEvent[T](event: T) extends Event[T]

/** Payload tagged as a UUA event (presumably Unified User Actions). */
private[real_time] case class UuaEvent[T](event: T) extends Event[T]
|
|
@ -0,0 +1,53 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates.real_time
|
||||
|
||||
import com.twitter.finagle.mtls.authentication.ServiceIdentifier
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.ml.featurestore.catalog.datasets.magicrecs.UserFeaturesDataset
|
||||
import com.twitter.ml.featurestore.catalog.datasets.geo.GeoUserLocationDataset
|
||||
import com.twitter.ml.featurestore.lib.dataset.DatasetParams
|
||||
import com.twitter.ml.featurestore.lib.export.strato.FeatureStoreAppNames
|
||||
import com.twitter.ml.featurestore.lib.online.FeatureStoreClient
|
||||
import com.twitter.ml.featurestore.lib.params.FeatureStoreParams
|
||||
import com.twitter.strato.client.{Client, Strato}
|
||||
import com.twitter.strato.opcontext.Attribution.ManhattanAppId
|
||||
import com.twitter.util.Duration
|
||||
|
||||
/** Factory helpers for the feature store client used by the real-time aggregates job. */
private[real_time] object FeatureStoreUtils {

  // Strato client with mutual TLS and a 50 ms request timeout.
  private def mkStratoClient(serviceIdentifier: ServiceIdentifier): Client = {
    val requestTimeout = Duration.fromMilliseconds(50)
    Strato.client
      .withMutualTls(serviceIdentifier)
      .withRequestTimeout(requestTimeout)
      .build()
  }

  // Per-dataset parameters for the user features dataset.
  private val userFeaturesDatasetParams: DatasetParams =
    DatasetParams(
      stratoSuffix = Some(FeatureStoreAppNames.Timelines),
      attributions = Seq(ManhattanAppId("athena", "timelines_aggregates_v2_features_by_user"))
    )

  // Per-dataset parameters for the geo user location dataset.
  private val geoUserLocationDatasetParams: DatasetParams =
    DatasetParams(
      attributions = Seq(ManhattanAppId("starbuck", "timelines_geo_features_by_user"))
    )

  private val featureStoreParams: FeatureStoreParams =
    FeatureStoreParams(
      perDataset = Map(
        UserFeaturesDataset.id -> userFeaturesDatasetParams,
        GeoUserLocationDataset.id -> geoUserLocationDatasetParams
      )
    )

  /**
   * Creates a [[FeatureStoreClient]] that serves the user, author and tweet
   * feature sets declared by the corresponding adapters.
   *
   * @param serviceIdentifier identity used for the Strato client's mutual TLS
   * @param statsReceiver     stats receiver handed to the feature store client
   */
  def mkFeatureStoreClient(
    serviceIdentifier: ServiceIdentifier,
    statsReceiver: StatsReceiver
  ): FeatureStoreClient = {
    com.twitter.server.Init() // necessary in order to use WilyNS path

    val stratoClient: Client = mkStratoClient(serviceIdentifier)
    val requestedFeatures =
      UserFeaturesAdapter.UserFeaturesSet ++
        AuthorFeaturesAdapter.UserFeaturesSet ++
        TweetFeaturesAdapter.TweetFeaturesSet

    FeatureStoreClient(
      featureSet = requestedFeatures,
      client = stratoClient,
      statsReceiver = statsReceiver,
      featureStoreParams = featureStoreParams
    )
  }
}
|
|
@ -0,0 +1,79 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates.real_time
|
||||
|
||||
import com.twitter.conversions.DurationOps._
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.storehaus.ReplicatedReadableStore
|
||||
import com.twitter.storehaus.Store
|
||||
import com.twitter.timelines.clients.memcache_common._
|
||||
import com.twitter.timelines.util.FailOpenHandler
|
||||
import com.twitter.util.Future
|
||||
|
||||
/** Builds the memcache client configuration for the served-features cache. */
object ServedFeaturesMemcacheConfigBuilder {

  /**
   * Resolves the cache destination path.
   *
   * @param cluster cluster segment interpolated into the destination path
   * @param isProd  when true the prod path is returned, otherwise the test path
   */
  def getTwCacheDestination(cluster: String, isProd: Boolean = false): String =
    if (isProd) {
      s"/srv#/prod/$cluster/cache/timelines_served_features"
    } else {
      s"/srv#/test/$cluster/cache//twemcache_timelines_served_features_cache"
    }

  /**
   * @param cluster The DC of the cache that this client will send requests to. This
   *                can be different to the DC where the summingbird job is running in.
   * @param isProd  Define if this client is part of a production summingbird job as
   *                different accesspoints will need to be chosen.
   */
  def build(cluster: String, isProd: Boolean = false): StorehausMemcacheConfig = {
    val destination = getTwCacheDestination(cluster, isProd)
    StorehausMemcacheConfig(
      destName = destination,
      keyPrefix = "",
      requestTimeout = 200.milliseconds,
      numTries = 2,
      globalTimeout = 400.milliseconds,
      tcpConnectTimeout = 200.milliseconds,
      connectionAcquisitionTimeout = 200.milliseconds,
      numPendingRequests = 1000,
      isReadOnly = false
    )
  }
}
|
||||
|
||||
/**
 * If lookup key does not exist locally, make a call to the replicated store(s).
 * If value exists remotely, write the first returned value to the local store
 * and return it. Map any exceptions to None so that the subsequent operations
 * may proceed.
 *
 * Counters registered on `scopedStatsReceiver`:
 *  - `localFails`  incremented when the local store has no value for the key
 *  - `localWrites` incremented when a remote value is written back locally
 *  - `remoteFails` incremented when the remote lookup returns no value
 *
 * @param localStore          store consulted first and populated on a remote hit
 * @param remoteStore         replicated store consulted on a local miss
 * @param scopedStatsReceiver receiver for the counters above and the fail-open handler
 */
class LocallyReplicatedStore[-K, V](
  localStore: Store[K, V],
  remoteStore: ReplicatedReadableStore[K, V],
  scopedStatsReceiver: StatsReceiver)
    extends Store[K, V] {
  private[this] val failOpenHandler = new FailOpenHandler(scopedStatsReceiver.scope("failOpen"))
  private[this] val localFailsCounter = scopedStatsReceiver.counter("localFails")
  private[this] val localWritesCounter = scopedStatsReceiver.counter("localWrites")
  private[this] val remoteFailsCounter = scopedStatsReceiver.counter("remoteFails")

  /**
   * Looks up `k` locally, falling back to the remote store on a miss.
   *
   * @return the local value if present, otherwise the remote lookup result;
   *         the fallback given to the fail-open handler yields `Future.None`
   */
  override def get(k: K): Future[Option[V]] =
    failOpenHandler {
      localStore
        .get(k)
        .flatMap {
          case Some(v) => Future.value(Some(v))
          case _ =>
            localFailsCounter.incr()
            val replicatedOptFu = remoteStore.get(k)
            // Async write-back if the remote result is not empty. The Future returned by
            // `put` is intentionally not awaited, so the caller is not delayed by it.
            replicatedOptFu.onSuccess {
              case Some(v) =>
                localWritesCounter.incr()
                localStore.put((k, Some(v)))
              case _ =>
                remoteFailsCounter.incr()
                // `()` is the unit value; the bare identifier `Unit` would name the
                // companion object instead.
                ()
            }
            replicatedOptFu
        }
    } { _: Throwable => Future.None }
}
|
|
@ -0,0 +1,254 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates.real_time
|
||||
|
||||
import com.twitter.finagle.stats.Counter
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.ml.api.constant.SharedFeatures
|
||||
import com.twitter.ml.api.DataRecord
|
||||
import com.twitter.ml.api.DataRecordMerger
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.ml.api.RichDataRecord
|
||||
import com.twitter.ml.featurestore.catalog.entities.core.Author
|
||||
import com.twitter.ml.featurestore.catalog.entities.core.Tweet
|
||||
import com.twitter.ml.featurestore.catalog.entities.core.User
|
||||
import com.twitter.ml.featurestore.lib.online.FeatureStoreClient
|
||||
import com.twitter.summingbird.Producer
|
||||
import com.twitter.summingbird.storm.Storm
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.RealTimeAggregatesJobConfig
|
||||
import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures
|
||||
import java.lang.{Long => JLong}
|
||||
|
||||
import com.twitter.unified_user_actions.thriftscala.ActionType
|
||||
import com.twitter.unified_user_actions.thriftscala.UnifiedUserAction
|
||||
|
||||
private[real_time] object StormAggregateSourceUtils {
|
||||
type UserId = Long
|
||||
type AuthorId = Long
|
||||
type TweetId = Long
|
||||
|
||||
/**
 * Attaches a [[FeatureStoreClient]] to the underlying [[Producer]] by chaining
 * up to three left joins, each gated by a flag on `jobConfig`:
 *  1. user features keyed by [[SharedFeatures.USER_ID]]
 *  2. author features keyed by [[TimelinesSharedFeatures.SOURCE_AUTHOR_ID]]
 *  3. tweet features keyed by [[TimelinesSharedFeatures.SOURCE_TWEET_ID]]
 * A disabled stage passes the previous stage's producer through unchanged.
 *
 * @param underlyingProducer converts a stream of [[com.twitter.clientapp.thriftscala.LogEvent]]
 *                           to a stream of [[DataRecord]].
 * @param jobConfig           supplies the service identifier and the keyedBy*Enabled flags
 * @param scopedStatsReceiver receiver for the per-join counters and the feature store client
 * @return the producer after all enabled joins have been applied
 */
def wrapByFeatureStoreClient(
  underlyingProducer: Producer[Storm, Event[DataRecord]],
  jobConfig: RealTimeAggregatesJobConfig,
  scopedStatsReceiver: StatsReceiver
): Producer[Storm, Event[DataRecord]] = {
  // Counters for the user join. These stat names carry no prefix.
  lazy val userKeyRecords = scopedStatsReceiver.counter("keyDataRecord")
  lazy val userKeyFeatures = scopedStatsReceiver.counter("keyFeature")
  lazy val userLeftRecords = scopedStatsReceiver.counter("leftDataRecord")
  lazy val userRightRecords = scopedStatsReceiver.counter("rightDataRecord")
  lazy val userMergedFeatures = scopedStatsReceiver.counter("mergeNumFeatures")

  // Counters for the author join.
  lazy val authorKeyRecords = scopedStatsReceiver.counter("authorKeyDataRecord")
  lazy val authorKeyFeatures = scopedStatsReceiver.counter("authorKeyFeature")
  lazy val authorLeftRecords = scopedStatsReceiver.counter("authorLeftDataRecord")
  lazy val authorRightRecords = scopedStatsReceiver.counter("authorRightDataRecord")
  lazy val authorMergedFeatures = scopedStatsReceiver.counter("authorMergeNumFeatures")

  // Counters for the tweet join.
  lazy val tweetKeyRecords = scopedStatsReceiver.counter("tweetKeyDataRecord")
  lazy val tweetKeyFeatures = scopedStatsReceiver.counter("tweetKeyFeature")
  lazy val tweetLeftRecords = scopedStatsReceiver.counter("tweetLeftDataRecord")
  lazy val tweetRightRecords = scopedStatsReceiver.counter("tweetRightDataRecord")
  lazy val tweetMergedFeatures = scopedStatsReceiver.counter("tweetMergeNumFeatures")

  // Created on first use and shared by all three readable stores.
  @transient lazy val fsClient: FeatureStoreClient =
    FeatureStoreUtils.mkFeatureStoreClient(
      serviceIdentifier = jobConfig.serviceIdentifier,
      statsReceiver = scopedStatsReceiver
    )

  // Stage 1: user features.
  lazy val withUserFeatures: Producer[Storm, Event[DataRecord]] =
    if (!jobConfig.keyedByUserEnabled) {
      underlyingProducer
    } else {
      lazy val userFeaturesService: Storm#Service[Set[UserId], DataRecord] =
        Storm.service(
          new UserFeaturesReadableStore(
            featureStoreClient = fsClient,
            userEntity = User,
            userFeaturesAdapter = UserFeaturesAdapter
          )
        )

      leftJoinDataRecordProducer(
        keyFeature = SharedFeatures.USER_ID,
        leftDataRecordProducer = underlyingProducer,
        rightStormService = userFeaturesService,
        keyDataRecordCounter = userKeyRecords,
        keyFeatureCounter = userKeyFeatures,
        leftDataRecordCounter = userLeftRecords,
        rightDataRecordCounter = userRightRecords,
        mergeNumFeaturesCounter = userMergedFeatures
      )
    }

  // Stage 2: author features, applied on top of stage 1.
  lazy val withAuthorFeatures: Producer[Storm, Event[DataRecord]] =
    if (!jobConfig.keyedByAuthorEnabled) {
      withUserFeatures
    } else {
      lazy val authorFeaturesService: Storm#Service[Set[AuthorId], DataRecord] =
        Storm.service(
          new UserFeaturesReadableStore(
            featureStoreClient = fsClient,
            userEntity = Author,
            userFeaturesAdapter = AuthorFeaturesAdapter
          )
        )

      leftJoinDataRecordProducer(
        keyFeature = TimelinesSharedFeatures.SOURCE_AUTHOR_ID,
        leftDataRecordProducer = withUserFeatures,
        rightStormService = authorFeaturesService,
        keyDataRecordCounter = authorKeyRecords,
        keyFeatureCounter = authorKeyFeatures,
        leftDataRecordCounter = authorLeftRecords,
        rightDataRecordCounter = authorRightRecords,
        mergeNumFeaturesCounter = authorMergedFeatures
      )
    }

  // Stage 3: tweet features, applied on top of stage 2.
  lazy val withTweetFeatures: Producer[Storm, Event[DataRecord]] =
    if (!jobConfig.keyedByTweetEnabled) {
      withAuthorFeatures
    } else {
      lazy val tweetFeaturesService: Storm#Service[Set[TweetId], DataRecord] =
        Storm.service(
          new TweetFeaturesReadableStore(
            featureStoreClient = fsClient,
            tweetEntity = Tweet,
            tweetFeaturesAdapter = TweetFeaturesAdapter
          )
        )

      leftJoinDataRecordProducer(
        keyFeature = TimelinesSharedFeatures.SOURCE_TWEET_ID,
        leftDataRecordProducer = withAuthorFeatures,
        rightStormService = tweetFeaturesService,
        keyDataRecordCounter = tweetKeyRecords,
        keyFeatureCounter = tweetKeyFeatures,
        leftDataRecordCounter = tweetLeftRecords,
        rightDataRecordCounter = tweetRightRecords,
        mergeNumFeaturesCounter = tweetMergedFeatures
      )
    }

  withTweetFeatures
}
|
||||
|
||||
// Lazily created merger instance; mergeDataRecord uses it to merge joined records.
private[this] lazy val DataRecordMerger = new DataRecordMerger
|
||||
|
||||
/**
 * Builds the join key for a client event data record.
 *
 * @param keyFeature Feature to extract join key value: USER_ID, SOURCE_TWEET_ID, etc.
 * @param record DataRecord containing client engagement and basic tweet-side features
 * @param keyDataRecordCounter incremented once for every record passed in
 * @param keyFeatureCounter incremented only when the record carries `keyFeature`
 * @return a singleton set holding the key value, or an empty set when the record
 *         does not have `keyFeature`; used by the subsequent leftJoin operation.
 */
private[this] def mkKey(
  keyFeature: Feature[JLong],
  record: DataRecord,
  keyDataRecordCounter: Counter,
  keyFeatureCounter: Counter
): Set[Long] = {
  keyDataRecordCounter.incr()
  val rich = new RichDataRecord(record)
  if (!rich.hasFeature(keyFeature)) {
    Set.empty[Long]
  } else {
    keyFeatureCounter.incr()
    val joinKey: Long = rich.getFeatureValue(keyFeature).toLong
    Set(joinKey)
  }
}
|
||||
|
||||
/**
 * After the leftJoin, merge the joined data record (if any) into the client
 * event data record, which is then used for further aggregation.
 *
 * The merge is applied to `leftRecord.event` itself, and the same `leftRecord`
 * wrapper is returned.
 *
 * @param leftRecord              event whose data record receives the joined features
 * @param rightRecordOpt          joined record, absent when the join found nothing
 * @param leftDataRecordCounter   incremented for every call
 * @param rightDataRecordCounter  incremented when a joined record is present
 * @param mergeNumFeaturesCounter incremented by the left record's feature count after a merge
 */
private[this] def mergeDataRecord(
  leftRecord: Event[DataRecord],
  rightRecordOpt: Option[DataRecord],
  leftDataRecordCounter: Counter,
  rightDataRecordCounter: Counter,
  mergeNumFeaturesCounter: Counter
): Event[DataRecord] = {
  leftDataRecordCounter.incr()
  rightRecordOpt match {
    case Some(joined) =>
      rightDataRecordCounter.incr()
      DataRecordMerger.merge(leftRecord.event, joined)
      mergeNumFeaturesCounter.incr(new RichDataRecord(leftRecord.event).numFeatures())
    case None =>
      ()
  }
  leftRecord
}
|
||||
|
||||
/**
 * Keys every event of `leftDataRecordProducer`, left-joins it against
 * `rightStormService` and merges any joined record into the event.
 *
 * Only [[HomeEvent]]s get a key derived from `keyFeature`; profile, search and
 * UUA events are given an empty key set.
 *
 * The counters are taken by name, so each one is evaluated where it is used
 * inside the mapping functions rather than when this method is called.
 */
private[this] def leftJoinDataRecordProducer(
  keyFeature: Feature[JLong],
  leftDataRecordProducer: Producer[Storm, Event[DataRecord]],
  rightStormService: Storm#Service[Set[Long], DataRecord],
  keyDataRecordCounter: => Counter,
  keyFeatureCounter: => Counter,
  leftDataRecordCounter: => Counter,
  rightDataRecordCounter: => Counter,
  mergeNumFeaturesCounter: => Counter
): Producer[Storm, Event[DataRecord]] = {
  val keyedLeftDataRecordProducer: Producer[Storm, (Set[Long], Event[DataRecord])] =
    leftDataRecordProducer.map {
      case homeEvent: HomeEvent[DataRecord] =>
        val joinKey =
          mkKey(keyFeature, homeEvent.event, keyDataRecordCounter, keyFeatureCounter)
        (joinKey, homeEvent)
      case other @ (_: ProfileEvent[_] | _: SearchEvent[_] | _: UuaEvent[_]) =>
        (Set.empty[Long], other)
    }

  keyedLeftDataRecordProducer
    .leftJoin(rightStormService)
    .map {
      case (_, (left, joinedOpt)) =>
        mergeDataRecord(
          left,
          joinedOpt,
          leftDataRecordCounter,
          rightDataRecordCounter,
          mergeNumFeaturesCounter
        )
    }
}
|
||||
|
||||
/**
 * Filter Unified User Actions events to include only actions that have a home
 * timeline visit prior to landing on the page.
 *
 * Accepted actions:
 *  - tweet impressions and profile impressions whose breadcrumb views contain "home"
 *  - fullscreen video impressions whose breadcrumb views contain both "home" and "video"
 *
 * @param event the unified user action to inspect
 * @return true when the action type is one of the above and its breadcrumb
 *         views satisfy the corresponding condition; false otherwise,
 *         including when the event has no breadcrumb views
 */
def isUuaBCEEventsFromHome(event: UnifiedUserAction): Boolean = {
  def breadcrumbViewsContain(view: String): Boolean =
    event.eventMetadata.breadcrumbViews.exists(_.contains(view))

  event.actionType match {
    case ActionType.ClientTweetV2Impression | ActionType.ClientProfileV2Impression =>
      breadcrumbViewsContain("home")
    case ActionType.ClientTweetVideoFullscreenV2Impression =>
      // Short-circuit `&&`: the "video" check only runs when "home" is present.
      breadcrumbViewsContain("home") && breadcrumbViewsContain("video")
    case _ => false
  }
}
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates.real_time
|
||||
|
||||
import com.twitter.conversions.DurationOps._
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.{
|
||||
OnlineAggregationStoresTrait,
|
||||
RealTimeAggregateStore
|
||||
}
|
||||
|
||||
/**
 * Online aggregation config for Timelines: defines the production and staging
 * stores, the input source, and the aggregates to compute.
 */
object TimelinesOnlineAggregationConfig
    extends TimelinesOnlineAggregationDefinitionsTrait
    with OnlineAggregationStoresTrait {

  // Both stores share a 5-day cache TTL. A def is used, not a val, so it can be
  // called from the lazy vals below without depending on initialization order.
  private def storeWithDefaultTtl(dataSet: String, prod: Boolean) =
    RealTimeAggregateStore(
      memcacheDataSet = dataSet,
      isProd = prod,
      cacheTTL = 5.days
    )

  override lazy val ProductionStore =
    storeWithDefaultTtl(dataSet = "timelines_real_time_aggregates", prod = true)

  override lazy val StagingStore =
    storeWithDefaultTtl(dataSet = "twemcache_timelines_real_time_aggregates", prod = false)

  override lazy val inputSource =
    TimelinesOnlineAggregationSources.timelinesOnlineAggregateSource

  /**
   * AggregateToCompute: This defines the complete set of aggregates to be
   * computed by the aggregation job and to be stored in memcache.
   */
  override lazy val AggregatesToCompute = ProdAggregates ++ StagingAggregates
}
|
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,5 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates.real_time
|
||||
|
||||
/** Holds the storm aggregate source instance used as the online aggregation input. */
object TimelinesOnlineAggregationSources {
  val timelinesOnlineAggregateSource: TimelinesStormAggregateSource =
    new TimelinesStormAggregateSource
}
|
|
@ -0,0 +1,182 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates.real_time
|
||||
|
||||
import com.twitter.conversions.DurationOps._
|
||||
import com.twitter.finagle.stats.DefaultStatsReceiver
|
||||
import com.twitter.summingbird.Options
|
||||
import com.twitter.summingbird.online.option.FlatMapParallelism
|
||||
import com.twitter.summingbird.online.option.SourceParallelism
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron._
|
||||
import com.twitter.timelines.data_processing.ml_util.transforms.DownsampleTransform
|
||||
import com.twitter.timelines.data_processing.ml_util.transforms.RichITransform
|
||||
import com.twitter.timelines.data_processing.ml_util.transforms.UserDownsampleTransform
|
||||
|
||||
import com.twitter.timelines.prediction.common.aggregates.BCELabelTransformFromUUADataRecord
|
||||
|
||||
/**
|
||||
* Sets up relevant topology parameters. Our primary goal is to handle the
|
||||
* LogEvent stream and aggregate (sum) on the parsed DataRecords without falling
|
||||
* behind. Our constraint is the resulting write (and read) QPS to the backing
|
||||
* memcache store.
|
||||
*
|
||||
* If the job is falling behind, add more flatMappers and/or Summers after
|
||||
* inspecting the viz panels for the respective job (go/heron-ui). An increase in
|
||||
* Summers (and/or aggregation keys and features in the config) results in an
|
||||
* increase in memcache QPS (go/cb and search for our cache). Adjust with CacheSize
|
||||
* settings until QPS is well-controlled.
|
||||
*
|
||||
*/
|
||||
object TimelinesRealTimeAggregatesJobConfigs extends RealTimeAggregatesJobConfigs {
  import TimelinesOnlineAggregationUtils._

  /**
   * Drops input records that carry no label/engagement from AllTweetLabels (explicit user
   * engagements, including public, private and impression events) and keeps every record
   * that does carry one.
   *
   * By not ingesting records without engagements, we guarantee that no distribution shift
   * occurs in the computed aggregate features when a new spout is added to the input
   * aggregate sources. Counterfactual signal is still available since we aggregate on
   * explicit dwell engagements.
   */
  val NegativeDownsampleTransform =
    DownsampleTransform(
      negativeSamplingRate = 0.0,
      keepLabels = AllTweetLabels,
      positiveSamplingRate = 1.0
    )

  /**
   * User-consistent downsampling for the devel topology, used to reduce traffic (the original
   * author's stated target is the equivalent of 10% of prod traffic). In [[Devel]] it runs
   * first, and [[NegativeDownsampleTransform]] then removes records without explicit labels.
   * Sampling is user-consistent in order to more closely approximate prod query patterns.
   *
   * NOTE(review): how `availability = 1000` maps to a fraction of users is decided by
   * UserDownsampleTransform, which is not visible here -- confirm it matches the 10% target.
   */
  val StagingUserBasedDownsampleTransform =
    UserDownsampleTransform(
      availability = 1000,
      featureName = "rta_devel"
    )

  override val Prod = RealTimeAggregatesJobConfig(
    // Job identity.
    appId = "summingbird_timelines_rta",
    name = "timelines_real_time_aggregates",
    teamName = "timelines",
    teamEmail = "",
    // Topology sizing.
    topologyWorkers = 1450,
    sourceCount = 120,
    flatMapCount = 1800,
    summerCount = 3850,
    cacheSize = 200,
    containerRamGigaBytes = 54,
    // If a component is hitting its GC limit in prod, tune componentToMetaSpaceSizeMap --
    // except for Source bolts, for which componentToRamGigaBytesMap should be tuned instead.
    componentToMetaSpaceSizeMap = Map(
      "Tail-FlatMap" -> "-XX:MaxMetaspaceSize=1024M -XX:MetaspaceSize=1024M",
      "Tail" -> "-XX:MaxMetaspaceSize=2560M -XX:MetaspaceSize=2560M"
    ),
    // If a component is hitting its memory limit in prod, its memory needs to increase:
    // either raise the total memory of the container (containerRamGigaBytes), or give that
    // component a larger share while keeping the total unchanged.
    componentToRamGigaBytesMap = Map(
      "Tail-FlatMap-Source" -> 3, // Home source
      "Tail-FlatMap-Source.2" -> 3, // Profile source
      "Tail-FlatMap-Source.3" -> 3, // Search source
      "Tail-FlatMap-Source.4" -> 3, // UUA source
      "Tail-FlatMap" -> 8
      // Tail uses whatever memory is left over in the container. Tune topologyWorkers and
      // containerRamGigaBytes so that this leftover stays above 10 GB.
    ),
    // Per-node parallelism, keyed by the node names assigned in TimelinesStormAggregateSource.
    topologyNamedOptions = Map(
      "TL_EVENTS_SOURCE" -> Options().set(SourceParallelism(120)),
      "PROFILE_EVENTS_SOURCE" -> Options().set(SourceParallelism(30)),
      "SEARCH_EVENTS_SOURCE" -> Options().set(SourceParallelism(10)),
      "UUA_EVENTS_SOURCE" -> Options().set(SourceParallelism(10)),
      "COMBINED_PRODUCER" -> Options().set(FlatMapParallelism(1800))
    ),
    // UUA data records for BCE events arrive without binary labels populated.
    // BCELabelTransformFromUUADataRecord sets the binary BCE dwell label features from the
    // corresponding dwell_time_ms. It must come BEFORE NegativeDownsampleTransform, because
    // NegativeDownsampleTransform removes data records that contain no feature from
    // AllTweetLabels.
    onlinePreTransforms =
      Seq(RichITransform(BCELabelTransformFromUUADataRecord), NegativeDownsampleTransform)
  )

  /**
   * Devel topology. Input is first downsampled per user by
   * [[StagingUserBasedDownsampleTransform]]. To better test the scalability of the topology,
   * the "Tail-FlatMap" and "Tail" components get far fewer instances than in prod
   * (flatMapCount = 150, summerCount = 300) while the "Tail-FlatMap-Source" sizing is kept
   * at the prod value (sourceCount = 120).
   *
   * NOTE(review): an earlier version of this comment quoted flatMapCount=110,
   * summerCount=105, sourceCount=100 and topologyWorkers=(110+105+100)/5=63; those numbers
   * no longer match the values below. Confirm how topologyWorkers = 120 was derived.
   */
  override val Devel = RealTimeAggregatesJobConfig(
    // Job identity.
    appId = "summingbird_timelines_rta_devel",
    name = "timelines_real_time_aggregates_devel",
    teamName = "timelines",
    teamEmail = "",
    // Topology sizing.
    topologyWorkers = 120,
    sourceCount = 120,
    flatMapCount = 150,
    summerCount = 300,
    cacheSize = 200,
    containerRamGigaBytes = 54,
    // If a component is hitting its GC limit, tune componentToMetaSpaceSizeMap --
    // except for Source bolts, for which componentToRamGigaBytesMap should be tuned instead.
    componentToMetaSpaceSizeMap = Map(
      "Tail-FlatMap" -> "-XX:MaxMetaspaceSize=1024M -XX:MetaspaceSize=1024M",
      "Tail" -> "-XX:MaxMetaspaceSize=2560M -XX:MetaspaceSize=2560M"
    ),
    // If a component is hitting its memory limit, its memory needs to increase:
    // either raise the total memory of the container (containerRamGigaBytes), or give that
    // component a larger share while keeping the total unchanged.
    componentToRamGigaBytesMap = Map(
      "Tail-FlatMap-Source" -> 3, // Home source
      "Tail-FlatMap-Source.2" -> 3, // Profile source
      "Tail-FlatMap-Source.3" -> 3, // Search source
      "Tail-FlatMap-Source.4" -> 3, // UUA source
      "Tail-FlatMap" -> 8
      // Tail uses whatever memory is left over in the container. Tune topologyWorkers and
      // containerRamGigaBytes so that this leftover stays above 10 GB.
    ),
    // Per-node parallelism, keyed by the node names assigned in TimelinesStormAggregateSource.
    topologyNamedOptions = Map(
      "TL_EVENTS_SOURCE" -> Options().set(SourceParallelism(120)),
      "PROFILE_EVENTS_SOURCE" -> Options().set(SourceParallelism(30)),
      "SEARCH_EVENTS_SOURCE" -> Options().set(SourceParallelism(10)),
      "UUA_EVENTS_SOURCE" -> Options().set(SourceParallelism(10)),
      "COMBINED_PRODUCER" -> Options().set(FlatMapParallelism(150))
    ),
    // BCELabelTransformFromUUADataRecord must come before NegativeDownsampleTransform
    // (see the explanation on Prod.onlinePreTransforms).
    onlinePreTransforms = Seq(
      StagingUserBasedDownsampleTransform,
      RichITransform(BCELabelTransformFromUUADataRecord),
      NegativeDownsampleTransform
    ),
    // User-reindexing Nighthawk stores; both are switched on for devel and point at the
    // test serversets.
    enableUserReindexingNighthawkBtreeStore = true,
    enableUserReindexingNighthawkHashStore = true,
    userReindexingNighthawkBtreeStoreConfig = NighthawkUnderlyingStoreConfig(
      serversetPath =
        "/twitter/service/cache-user/test/nighthawk_timelines_real_time_aggregates_btree_test_api",
      // NOTE: table names are prefixed to every pkey, so keep it short.
      tableName = "u_r_v1", // (u)ser_(r)eindexing_v1
      // Keep ttl <= 1 day: the store is keyed on user, so hit rates beyond 1 day are limited.
      cacheTTL = 1.day
    ),
    userReindexingNighthawkHashStoreConfig = NighthawkUnderlyingStoreConfig(
      // For prod: "/s/cache-user/nighthawk_timelines_real_time_aggregates_hash_api",
      serversetPath =
        "/twitter/service/cache-user/test/nighthawk_timelines_real_time_aggregates_hash_test_api",
      // NOTE: table names are prefixed to every pkey, so keep it short.
      tableName = "u_r_v1", // (u)ser_(r)eindexing_v1
      // Keep ttl <= 1 day: the store is keyed on user, so hit rates beyond 1 day are limited.
      cacheTTL = 1.day
    )
  )
}
|
||||
|
||||
/**
 * Real-time aggregates job for timelines: plugs the timelines job configs, the aggregates
 * to compute and a scoped stats receiver into [[RealTimeAggregatesJobBase]].
 */
object TimelinesRealTimeAggregatesJob extends RealTimeAggregatesJobBase {

  /** Prod / Devel topology settings. */
  override lazy val jobConfigs = TimelinesRealTimeAggregatesJobConfigs

  /** Aggregates taken from the online aggregation config. */
  override lazy val aggregatesToCompute = TimelinesOnlineAggregationConfig.AggregatesToCompute

  /** Stats for this job are reported under the "timelines_real_time_aggregates" scope. */
  override lazy val statsReceiver = DefaultStatsReceiver.scope("timelines_real_time_aggregates")
}
|
|
@ -0,0 +1,185 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates.real_time
|
||||
|
||||
import com.twitter.clientapp.thriftscala.LogEvent
|
||||
import com.twitter.conversions.DurationOps._
|
||||
import com.twitter.finagle.stats.Counter
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.ml.api.DataRecord
|
||||
import com.twitter.ml.api.constant.SharedFeatures
|
||||
import com.twitter.snowflake.id.SnowflakeId
|
||||
import com.twitter.summingbird._
|
||||
import com.twitter.summingbird.storm.Storm
|
||||
import com.twitter.summingbird_internal.sources.AppId
|
||||
import com.twitter.summingbird_internal.sources.storm.remote.ClientEventSourceScrooge2
|
||||
import com.twitter.timelines.data_processing.ad_hoc.suggests.common.AllScribeProcessor
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.RealTimeAggregatesJobConfig
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.StormAggregateSource
|
||||
import com.twitter.timelines.prediction.adapters.client_log_event.ClientLogEventAdapter
|
||||
import com.twitter.timelines.prediction.adapters.client_log_event.ProfileClientLogEventAdapter
|
||||
import com.twitter.timelines.prediction.adapters.client_log_event.SearchClientLogEventAdapter
|
||||
import com.twitter.timelines.prediction.adapters.client_log_event.UuaEventAdapter
|
||||
import com.twitter.unified_user_actions.client.config.KafkaConfigs
|
||||
import com.twitter.unified_user_actions.client.summingbird.UnifiedUserActionsSourceScrooge
|
||||
import com.twitter.unified_user_actions.thriftscala.UnifiedUserAction
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Storm producer of DataRecords built from client events generated on Home, Profile and
 * Search, plus Unified User Actions (UUA) events.
 */
class TimelinesStormAggregateSource extends StormAggregateSource {

  override val name = "timelines_rta"
  override val timestampFeature = SharedFeatures.TIMESTAMP

  // Names attached to the topology nodes built in `build`.
  private lazy val TimelinesClientEventSourceName = "TL_EVENTS_SOURCE"
  private lazy val ProfileClientEventSourceName = "PROFILE_EVENTS_SOURCE"
  private lazy val SearchClientEventSourceName = "SEARCH_EVENTS_SOURCE"
  private lazy val UuaEventSourceName = "UUA_EVENTS_SOURCE"
  private lazy val CombinedProducerName = "COMBINED_PRODUCER"
  private lazy val FeatureStoreProducerName = "FEATURE_STORE_PRODUCER"

  /**
   * True when the event carries a user id whose Snowflake timestamp is less than 30 days old.
   * NOTE(review): not referenced anywhere else in this class.
   */
  private def isNewUserEvent(event: LogEvent): Boolean = {
    val idTimestamp = for {
      logBase <- event.logBase
      userId <- logBase.userId
      createdAt <- SnowflakeId.timeFromIdOpt(userId)
    } yield createdAt
    idTimestamp.exists(_.untilNow < 30.days)
  }

  /** Bumps `dataRecordCounter` by the number of records and hands the records back. */
  private def countRecords(
    records: Seq[DataRecord],
    dataRecordCounter: Counter
  ): Seq[DataRecord] = {
    dataRecordCounter.incr(records.size)
    records
  }

  /** Home: only valid suggest-tweet events are adapted; anything else yields no records. */
  private def mkDataRecords(event: LogEvent, dataRecordCounter: Counter): Seq[DataRecord] = {
    val adapted: Seq[DataRecord] =
      if (AllScribeProcessor.isValidSuggestTweetEvent(event))
        ClientLogEventAdapter.adaptToDataRecords(event).asScala
      else
        Seq.empty[DataRecord]
    countRecords(adapted, dataRecordCounter)
  }

  /** Profile: every event is handed to the profile adapter. */
  private def mkProfileDataRecords(
    event: LogEvent,
    dataRecordCounter: Counter
  ): Seq[DataRecord] =
    countRecords(ProfileClientLogEventAdapter.adaptToDataRecords(event).asScala, dataRecordCounter)

  /** Search: every event is handed to the search adapter. */
  private def mkSearchDataRecords(
    event: LogEvent,
    dataRecordCounter: Counter
  ): Seq[DataRecord] =
    countRecords(SearchClientLogEventAdapter.adaptToDataRecords(event).asScala, dataRecordCounter)

  /** UUA: every event is handed to the UUA adapter. */
  private def mkUuaDataRecords(
    event: UnifiedUserAction,
    dataRecordCounter: Counter
  ): Seq[DataRecord] =
    countRecords(UuaEventAdapter.adaptToDataRecords(event).asScala, dataRecordCounter)

  /**
   * Assembles the producer: four sources are merged, each event is adapted to DataRecords,
   * the job's transforms are applied, and the result is wrapped by the feature store client.
   *
   * @param statsReceiver parent stats receiver; scoped here by this class's simple name
   * @param jobConfig supplies the app id and the transforms applied to each record
   */
  override def build(
    statsReceiver: StatsReceiver,
    jobConfig: RealTimeAggregatesJobConfig
  ): Producer[Storm, DataRecord] = {
    lazy val scopedStatsReceiver = statsReceiver.scope(getClass.getSimpleName)
    lazy val dataRecordCounter = scopedStatsReceiver.counter("dataRecord")

    // Step 1 (Home Timeline engagements): source => LogEvent
    lazy val homeEventProducer: Producer[Storm, HomeEvent[LogEvent]] =
      ClientEventSourceScrooge2(
        appId = AppId(jobConfig.appId),
        topic = "julep_client_event_suggests",
        resumeAtLastReadOffset = false,
        enableTls = true
      ).source
        .map(HomeEvent[LogEvent])
        .name(TimelinesClientEventSourceName)

    // Step 1 (Profile engagements): source => LogEvent
    lazy val profileEventProducer: Producer[Storm, ProfileEvent[LogEvent]] =
      ClientEventSourceScrooge2(
        appId = AppId(jobConfig.appId),
        topic = "julep_client_event_profile_real_time_engagement_metrics",
        resumeAtLastReadOffset = false,
        enableTls = true
      ).source
        .map(ProfileEvent[LogEvent])
        .name(ProfileClientEventSourceName)

    // Step 1 (Search engagements): source => LogEvent
    // NOTE(review): the previous comment here read "Only process events for all users to
    // save resource", which contradicts itself; no user-level filter is applied to this
    // source in the code below.
    lazy val searchEventProducer: Producer[Storm, SearchEvent[LogEvent]] =
      ClientEventSourceScrooge2(
        appId = AppId(jobConfig.appId),
        topic = "julep_client_event_search_real_time_engagement_metrics",
        resumeAtLastReadOffset = false,
        enableTls = true
      ).source
        .map(SearchEvent[LogEvent])
        .name(SearchClientEventSourceName)

    // Unified User Actions (includes Home and other product surfaces); only events accepted
    // by StormAggregateSourceUtils.isUuaBCEEventsFromHome are kept.
    lazy val uuaProducer: Producer[Storm, UuaEvent[UnifiedUserAction]] =
      UnifiedUserActionsSourceScrooge(
        appId = AppId(jobConfig.appId),
        parallelism = 10,
        kafkaConfig = KafkaConfigs.ProdUnifiedUserActionsEngagementOnly
      ).source
        .filter(StormAggregateSourceUtils.isUuaBCEEventsFromHome(_))
        .map(UuaEvent[UnifiedUserAction])
        .name(UuaEventSourceName)

    // Step 2 (combined):
    //   (a) merge the four sources
    //   (b) adapt each event => Seq[DataRecord], keeping the surface wrapper
    //   (c) apply the job's transforms (sampler)
    lazy val combinedProducer: Producer[Storm, Event[DataRecord]] =
      profileEventProducer // This becomes the bottom branch
        .merge(homeEventProducer) // This becomes the middle branch
        .merge(searchEventProducer)
        .merge(uuaProducer) // This becomes the top
        .flatMap { // event => Seq[DataRecord]
          case home: HomeEvent[LogEvent] =>
            mkDataRecords(home.event, dataRecordCounter).map(HomeEvent[DataRecord])
          case profile: ProfileEvent[LogEvent] =>
            mkProfileDataRecords(profile.event, dataRecordCounter).map(ProfileEvent[DataRecord])
          case search: SearchEvent[LogEvent] =>
            mkSearchDataRecords(search.event, dataRecordCounter).map(SearchEvent[DataRecord])
          case uua: UuaEvent[UnifiedUserAction] =>
            mkUuaDataRecords(uua.event, dataRecordCounter).map(UuaEvent[DataRecord])
        }
        .flatMap { // apply sampler
          case home: HomeEvent[DataRecord] =>
            jobConfig.sequentiallyTransform(home.event).map(HomeEvent[DataRecord])
          case profile: ProfileEvent[DataRecord] =>
            jobConfig.sequentiallyTransform(profile.event).map(ProfileEvent[DataRecord])
          case search: SearchEvent[DataRecord] =>
            jobConfig.sequentiallyTransform(search.event).map(SearchEvent[DataRecord])
          case uua: UuaEvent[DataRecord] =>
            jobConfig.sequentiallyTransform(uua.event).map(UuaEvent[DataRecord])
        }
        .name(CombinedProducerName)

    // Step 3: join with Feature Store features, then unwrap back to plain DataRecords.
    lazy val featureStoreProducer: Producer[Storm, DataRecord] =
      StormAggregateSourceUtils
        .wrapByFeatureStoreClient(
          underlyingProducer = combinedProducer,
          jobConfig = jobConfig,
          scopedStatsReceiver = scopedStatsReceiver
        )
        .map(_.event)
        .name(FeatureStoreProducerName)

    featureStoreProducer
  }
}
|
|
@ -0,0 +1,35 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates.real_time
|
||||
|
||||
import com.twitter.ml.api.DataRecord
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.ml.api.FeatureContext
|
||||
import com.twitter.ml.featurestore.catalog.entities.core.Tweet
|
||||
import com.twitter.ml.featurestore.catalog.features.trends.TweetTrendsScores
|
||||
import com.twitter.ml.featurestore.lib.TweetId
|
||||
import com.twitter.ml.featurestore.lib.data.PredictionRecord
|
||||
import com.twitter.ml.featurestore.lib.data.PredictionRecordAdapter
|
||||
import com.twitter.ml.featurestore.lib.feature.BoundFeature
|
||||
import com.twitter.ml.featurestore.lib.feature.BoundFeatureSet
|
||||
import com.twitter.timelines.prediction.common.adapters.TimelinesAdapterBase
|
||||
import java.util
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Adapts feature-store PredictionRecords to DataRecords for tweet features.
 *
 * The bound-feature map is empty here, so both the bound feature set and the feature
 * context built from it are empty.
 */
object TweetFeaturesAdapter extends TimelinesAdapterBase[PredictionRecord] {

  // Feature-store bound feature -> ML API continuous feature.
  private val ContinuousFeatureMap: Map[BoundFeature[TweetId, Double], Feature.Continuous] =
    Map.empty

  /** Bound features derived from the keys of the map above. */
  val TweetFeaturesSet: BoundFeatureSet = new BoundFeatureSet(ContinuousFeatureMap.keys.toSet)

  /** ML API features derived from the values of the map above. */
  val AllFeatures: Seq[Feature[_]] = ContinuousFeatureMap.values.toSeq

  private val adapter = PredictionRecordAdapter.oneToOne(TweetFeaturesSet)

  override def getFeatureContext: FeatureContext = new FeatureContext(AllFeatures: _*)

  override def commonFeatures: Set[Feature[_]] = Set.empty

  /** Returns a single-element list holding the adapted record. */
  override def adaptToDataRecords(record: PredictionRecord): util.List[DataRecord] = {
    val dataRecord = adapter.adaptToDataRecord(record)
    List(dataRecord).asJava
  }
}
|
|
@ -0,0 +1,53 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates.real_time
|
||||
|
||||
import com.twitter.ml.api.DataRecord
|
||||
import com.twitter.ml.featurestore.lib.TweetId
|
||||
import com.twitter.ml.featurestore.lib.data.PredictionRecord
|
||||
import com.twitter.ml.featurestore.lib.entity.Entity
|
||||
import com.twitter.ml.featurestore.lib.online.{FeatureStoreClient, FeatureStoreRequest}
|
||||
import com.twitter.storehaus.ReadableStore
|
||||
import com.twitter.timelines.prediction.common.adapters.TimelinesAdapterBase
|
||||
import com.twitter.util.Future
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * ReadableStore keyed on sets of tweet ids. Each key becomes one feature-store request; the
 * prediction record returned at the same position is adapted into the key's DataRecord.
 */
class TweetFeaturesReadableStore(
  featureStoreClient: FeatureStoreClient,
  tweetEntity: Entity[TweetId],
  tweetFeaturesAdapter: TimelinesAdapterBase[PredictionRecord])
    extends ReadableStore[Set[Long], DataRecord] {

  /**
   * Issues a single batched feature-store call for all keys.
   *
   * @return for each key, a Future holding the first DataRecord adapted from the prediction
   *         record at that key's position, or None when there is no record at that position
   *         or the adapter produces nothing
   */
  override def multiGet[K <: Set[Long]](keys: Set[K]): Map[K, Future[Option[DataRecord]]] = {
    // Fix an iteration order so that request i and response i refer to the same key.
    val keySeq: Seq[K] = keys.toSeq

    val requests: Seq[FeatureStoreRequest] = keySeq.map { tweetIds: Set[Long] =>
      FeatureStoreRequest(
        entityIds = tweetIds.map(tweetId => tweetEntity.withId(TweetId(tweetId))).toSeq
      )
    }

    val recordsFut: Future[Seq[PredictionRecord]] = featureStoreClient(requests)

    keySeq.zipWithIndex.map {
      case (key, position) =>
        val dataRecordFut: Future[Option[DataRecord]] = recordsFut.map { records =>
          for {
            predictionRecord <- records.lift(position)
            dataRecord <- tweetFeaturesAdapter
              .adaptToDataRecords(predictionRecord).asScala.headOption
          } yield dataRecord
        }
        key -> dataRecordFut
    }.toMap
  }
}
|
|
@ -0,0 +1,7 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates.real_time
|
||||
|
||||
import com.twitter.summingbird_internal.runner.storm.GenericRunner
|
||||
|
||||
/** Command-line entry point: hands the arguments and the job to GenericRunner. */
object TypeSafeRunner {
  def main(args: Array[String]): Unit = {
    GenericRunner(args, TimelinesRealTimeAggregatesJob(_))
  }
}
|
|
@ -0,0 +1,108 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates.real_time
|
||||
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType.InferredGender
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType.UserState
|
||||
import com.twitter.ml.api.Feature.Binary
|
||||
import com.twitter.ml.api.Feature.Text
|
||||
import com.twitter.ml.api.DataRecord
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.ml.api.FeatureContext
|
||||
import com.twitter.ml.api.RichDataRecord
|
||||
import com.twitter.ml.featurestore.catalog.entities.core.User
|
||||
import com.twitter.ml.featurestore.catalog.features.core.UserAccount
|
||||
import com.twitter.ml.featurestore.catalog.features.geo.UserLocation
|
||||
import com.twitter.ml.featurestore.catalog.features.magicrecs.UserActivity
|
||||
import com.twitter.ml.featurestore.lib.EntityId
|
||||
import com.twitter.ml.featurestore.lib.data.PredictionRecord
|
||||
import com.twitter.ml.featurestore.lib.feature.BoundFeature
|
||||
import com.twitter.ml.featurestore.lib.feature.BoundFeatureSet
|
||||
import com.twitter.ml.featurestore.lib.UserId
|
||||
import com.twitter.ml.featurestore.lib.{Discrete => FSDiscrete}
|
||||
import com.twitter.timelines.prediction.common.adapters.TimelinesAdapterBase
|
||||
import com.twitter.timelines.prediction.features.user_health.UserHealthFeatures
|
||||
import java.lang.{Boolean => JBoolean}
|
||||
import java.lang.{String => JString}
|
||||
import java.util
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Adapts feature-store PredictionRecords about a user into a DataRecord carrying a
 * user-state flag, the user's country code and a combined "verified" flag.
 */
object UserFeaturesAdapter extends TimelinesAdapterBase[PredictionRecord] {
  val UserStateBoundFeature: BoundFeature[UserId, FSDiscrete] = UserActivity.UserState.bind(User)

  // All user-state flags share the same personal-data annotation.
  private def userStateFlag(featureName: String): Binary =
    new Binary(featureName, Set(UserState).asJava)

  /**
   * Boolean features about viewer's user state.
   * enum UserState {
   *   NEW = 0,
   *   NEAR_ZERO = 1,
   *   VERY_LIGHT = 2,
   *   LIGHT = 3,
   *   MEDIUM_TWEETER = 4,
   *   MEDIUM_NON_TWEETER = 5,
   *   HEAVY_NON_TWEETER = 6,
   *   HEAVY_TWEETER = 7
   * }(persisted='true')
   */
  val IS_USER_NEW = userStateFlag("timelines.user_state.is_user_new")
  val IS_USER_LIGHT = userStateFlag("timelines.user_state.is_user_light")
  val IS_USER_MEDIUM_TWEETER = userStateFlag("timelines.user_state.is_user_medium_tweeter")
  val IS_USER_MEDIUM_NON_TWEETER =
    userStateFlag("timelines.user_state.is_user_medium_non_tweeter")
  val IS_USER_HEAVY_NON_TWEETER = userStateFlag("timelines.user_state.is_user_heavy_non_tweeter")
  val IS_USER_HEAVY_TWEETER = userStateFlag("timelines.user_state.is_user_heavy_tweeter")

  // Enum value -> flag. Values 1, 2 and 3 (NEAR_ZERO, VERY_LIGHT, LIGHT) share IS_USER_LIGHT.
  val userStateToFeatureMap: Map[Long, Binary] = Map(
    0L -> IS_USER_NEW,
    1L -> IS_USER_LIGHT,
    2L -> IS_USER_LIGHT,
    3L -> IS_USER_LIGHT,
    4L -> IS_USER_MEDIUM_TWEETER,
    5L -> IS_USER_MEDIUM_NON_TWEETER,
    6L -> IS_USER_HEAVY_NON_TWEETER,
    7L -> IS_USER_HEAVY_TWEETER
  )

  val UserStateBooleanFeatures: Set[Feature[_]] = userStateToFeatureMap.values.toSet

  val USER_COUNTRY_ID = new Text("geo.user_location.country_code")
  val UserCountryCodeFeature: BoundFeature[UserId, String] =
    UserLocation.CountryCodeAlpha2.bind(User)
  val UserLocationFeatures: Set[Feature[_]] = Set(USER_COUNTRY_ID)

  // The individual verification flags; they are OR-ed into a single output feature below.
  private val UserVerifiedFeaturesSet = Set(
    UserAccount.IsUserVerified.bind(User),
    UserAccount.IsUserBlueVerified.bind(User),
    UserAccount.IsUserGoldVerified.bind(User),
    UserAccount.IsUserGrayVerified.bind(User)
  )

  /** Every bound feature this adapter reads from the feature store. */
  val UserFeaturesSet: BoundFeatureSet =
    BoundFeatureSet(UserStateBoundFeature, UserCountryCodeFeature) ++
      BoundFeatureSet(UserVerifiedFeaturesSet.asInstanceOf[Set[BoundFeature[_ <: EntityId, _]]])

  private val allFeatures: Seq[Feature[_]] =
    UserStateBooleanFeatures.toSeq ++ GenderBooleanFeatures.toSeq ++
      UserLocationFeatures.toSeq ++ Seq(UserHealthFeatures.IsUserVerifiedUnion)

  override def getFeatureContext: FeatureContext = new FeatureContext(allFeatures: _*)
  override def commonFeatures: Set[Feature[_]] = Set.empty

  /**
   * Builds one DataRecord from the prediction record:
   *  - the flag mapped from the user state is set to true, when the state is present and mapped;
   *  - the country code is copied, when present;
   *  - IsUserVerifiedUnion is always set, and is true iff any verification flag is true.
   */
  override def adaptToDataRecords(record: PredictionRecord): util.List[DataRecord] = {
    val newRecord = new RichDataRecord(new DataRecord)

    for {
      userState <- record.getFeatureValue(UserStateBoundFeature)
      stateFlag <- userStateToFeatureMap.get(userState.value)
    } newRecord.setFeatureValue[JBoolean](stateFlag, true)

    for (countryCode <- record.getFeatureValue(UserCountryCodeFeature)) {
      newRecord.setFeatureValue[JString](USER_COUNTRY_ID, countryCode)
    }

    // A missing verification flag counts as "not verified".
    val isUserVerifiedUnion =
      UserVerifiedFeaturesSet.exists(feature => record.getFeatureValue(feature).getOrElse(false))
    newRecord.setFeatureValue[JBoolean](UserHealthFeatures.IsUserVerifiedUnion, isUserVerifiedUnion)

    List(newRecord.getRecord).asJava
  }
}
|
|
@ -0,0 +1,37 @@
|
|||
package com.twitter.timelines.prediction.common.aggregates.real_time
|
||||
|
||||
import com.twitter.ml.api.DataRecord
|
||||
import com.twitter.ml.featurestore.lib.UserId
|
||||
import com.twitter.ml.featurestore.lib.data.PredictionRecord
|
||||
import com.twitter.ml.featurestore.lib.entity.Entity
|
||||
import com.twitter.ml.featurestore.lib.online.{FeatureStoreClient, FeatureStoreRequest}
|
||||
import com.twitter.storehaus.ReadableStore
|
||||
import com.twitter.timelines.prediction.common.adapters.TimelinesAdapterBase
|
||||
import com.twitter.util.Future
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * ReadableStore keyed on sets of user ids. Each key becomes one feature-store request; the
 * prediction record returned at the same position is adapted into the key's DataRecord.
 */
class UserFeaturesReadableStore(
  featureStoreClient: FeatureStoreClient,
  userEntity: Entity[UserId],
  userFeaturesAdapter: TimelinesAdapterBase[PredictionRecord])
    extends ReadableStore[Set[Long], DataRecord] {

  /**
   * Issues a single batched feature-store call for all keys.
   *
   * @param keys sets of user ids; every set becomes one FeatureStoreRequest
   * @return for each key, a Future holding the first DataRecord adapted from the prediction
   *         record at that key's position, or None when the response has no record at that
   *         position or the adapter produces nothing
   */
  override def multiGet[K <: Set[Long]](keys: Set[K]): Map[K, Future[Option[DataRecord]]] = {
    // Fix an iteration order so that request i and response i refer to the same key.
    val orderedKeys: Seq[K] = keys.toSeq
    val featureStoreRequests: Seq[FeatureStoreRequest] = orderedKeys.map { key: Set[Long] =>
      FeatureStoreRequest(
        entityIds = key.map(userId => userEntity.withId(UserId(userId))).toSeq
      )
    }
    val predictionRecordsFut: Future[Seq[PredictionRecord]] = featureStoreClient(
      featureStoreRequests)

    orderedKeys.zipWithIndex.map {
      case (userIdSet, index) =>
        val dataRecordFutOpt: Future[Option[DataRecord]] = predictionRecordsFut.map {
          predictionRecords =>
            // `lift` instead of direct indexing: a response shorter than the request list
            // yields None for the affected keys rather than failing their Future with an
            // IndexOutOfBoundsException (same handling as TweetFeaturesReadableStore).
            predictionRecords.lift(index).flatMap { predictionRecordAtIndex: PredictionRecord =>
              userFeaturesAdapter.adaptToDataRecords(predictionRecordAtIndex).asScala.headOption
            }
        }
        (userIdSet, dataRecordFutOpt)
    }.toMap
  }
}
|
|
@ -0,0 +1,6 @@
|
|||
## Prediction Features
|
||||
|
||||
This directory contains a collection of `Features` (`com.twitter.ml.api.Feature`): definitions of feature names and datatypes that allow the features to be efficiently processed and passed to the different ranking models.
|
||||
Because the features are predefined with their names and datatypes, they can be identified by only a hash of their name when they are generated, scribed, or used for scoring.
|
||||
|
||||
Not all of these features are used in the model; many are experimental or deprecated.
|
|
@ -0,0 +1,11 @@
|
|||
scala_library(
|
||||
sources = ["*.scala"],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"src/java/com/twitter/ml/api:api-base",
|
||||
"src/scala/com/twitter/suggests/controller_data",
|
||||
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
|
||||
"src/thrift/com/twitter/timelineservice/server/suggests/logging:thrift-scala",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,169 @@
|
|||
package com.twitter.timelines.prediction.features.client_log_event
|
||||
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.ml.api.Feature.Binary
|
||||
import com.twitter.ml.api.Feature.Continuous
|
||||
import com.twitter.ml.api.Feature.Discrete
|
||||
import scala.collection.JavaConverters._
|
||||
import com.twitter.timelineservice.suggests.logging.candidate_tweet_source_id.thriftscala.CandidateTweetSourceId
|
||||
|
||||
/**
 * Feature definitions for tweet attributes in the `client_log_event.tweet.*` namespace,
 * plus lookup tables and named groupings built from them.
 *
 * Each feature is declared once with its name and, where one is given, the set of
 * personal-data types it is annotated with. The name string is passed verbatim to the
 * feature constructor, so treat it as part of the data contract: do not rename or
 * "normalize" a name without checking every producer and consumer of that feature.
 *
 * Declaration order matters: the maps and sets at the bottom reference the feature vals,
 * so the vals must be initialized first.
 */
object ClientLogEventDataRecordFeatures {

  // ---------------------------------------------------------------------------
  // Tweet content / entity features (annotated with personal-data types)
  // ---------------------------------------------------------------------------

  val HasConsumerVideo = new Binary(
    "client_log_event.tweet.has_consumer_video",
    Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
  val PhotoCount = new Continuous(
    "client_log_event.tweet.photo_count",
    Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava)
  val HasImage = new Binary(
    "client_log_event.tweet.has_image",
    Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
  val IsReply =
    new Binary("client_log_event.tweet.is_reply", Set(PublicReplies, PrivateReplies).asJava)
  val IsRetweet =
    new Binary("client_log_event.tweet.is_retweet", Set(PublicRetweets, PrivateRetweets).asJava)
  val IsPromoted =
    new Binary(
      "client_log_event.tweet.is_promoted",
      Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
  val HasVisibleLink = new Binary(
    "client_log_event.tweet.has_visible_link",
    Set(UrlFoundFlag, PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
  val HasHashtag = new Binary(
    "client_log_event.tweet.has_hashtag",
    Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)

  // ---------------------------------------------------------------------------
  // Network relationship and candidate-source features (no personal-data annotation)
  // ---------------------------------------------------------------------------

  val FromMutualFollow = new Binary("client_log_event.tweet.from_mutual_follow")
  val IsInNetwork = new Binary("client_log_event.tweet.is_in_network")
  val IsNotInNetwork = new Binary("client_log_event.tweet.is_not_in_network")
  val FromRecap = new Binary("client_log_event.tweet.from_recap")
  val FromRecycled = new Binary("client_log_event.tweet.from_recycled")
  val FromActivity = new Binary("client_log_event.tweet.from_activity")
  val FromSimcluster = new Binary("client_log_event.tweet.from_simcluster")
  val FromErg = new Binary("client_log_event.tweet.from_erg")
  val FromCroon = new Binary("client_log_event.tweet.from_croon")
  val FromList = new Binary("client_log_event.tweet.from_list")
  val FromRecTopic = new Binary("client_log_event.tweet.from_rec_topic")
  // NOTE: this is the only camelCase feature name in the object. The string is the feature's
  // identity, so it is intentionally left as-is rather than changed to snake_case.
  val InjectedPosition = new Discrete("client_log_event.tweet.injectedPosition")

  // ---------------------------------------------------------------------------
  // Tweet-type features: content, social context and topic preference
  // ---------------------------------------------------------------------------

  val TextOnly = new Binary("client_log_event.tweet.text_only")
  val HasLikedBySocialContext = new Binary("client_log_event.tweet.has_liked_by_social_context")
  val HasFollowedBySocialContext = new Binary(
    "client_log_event.tweet.has_followed_by_social_context")
  val HasTopicSocialContext = new Binary("client_log_event.tweet.has_topic_social_context")
  val IsFollowedTopicTweet = new Binary("client_log_event.tweet.is_followed_topic_tweet")
  val IsRecommendedTopicTweet = new Binary("client_log_event.tweet.is_recommended_topic_tweet")

  // ---------------------------------------------------------------------------
  // Tweet-type features: freshness buckets
  // ---------------------------------------------------------------------------

  val IsTweetAgeLessThan15Seconds = new Binary(
    "client_log_event.tweet.tweet_age_less_than_15_seconds")
  val IsTweetAgeLessThanOrEqualTo30Minutes = new Binary(
    "client_log_event.tweet.tweet_age_lte_30_minutes")
  val IsTweetAgeLessThanOrEqualTo1Hour = new Binary("client_log_event.tweet.tweet_age_lte_1_hour")
  val IsTweetAgeLessThanOrEqualTo6Hours = new Binary("client_log_event.tweet.tweet_age_lte_6_hours")
  val IsTweetAgeLessThanOrEqualTo12Hours = new Binary(
    "client_log_event.tweet.tweet_age_lte_12_hours")
  val IsTweetAgeGreaterThanOrEqualTo24Hours = new Binary(
    "client_log_event.tweet.tweet_age_gte_24_hours")

  // ---------------------------------------------------------------------------
  // Tweet-type features: popularity buckets
  // ---------------------------------------------------------------------------

  val HasGreaterThanOrEqualTo100Favs = new Binary("client_log_event.tweet.has_gte_100_favs")
  val HasGreaterThanOrEqualTo1KFavs = new Binary("client_log_event.tweet.has_gte_1k_favs")
  val HasGreaterThanOrEqualTo10KFavs = new Binary("client_log_event.tweet.has_gte_10k_favs")
  val HasGreaterThanOrEqualTo100KFavs = new Binary("client_log_event.tweet.has_gte_100k_favs")
  val HasGreaterThanOrEqualTo10Retweets = new Binary("client_log_event.tweet.has_gte_10_retweets")
  val HasGreaterThanOrEqualTo100Retweets = new Binary("client_log_event.tweet.has_gte_100_retweets")
  val HasGreaterThanOrEqualTo1KRetweets = new Binary("client_log_event.tweet.has_gte_1k_retweets")

  /** Lookup from a tweet-type key string to the binary feature that represents it. */
  val TweetTypeToFeatureMap: Map[String, Binary] = Map(
    "link" -> HasVisibleLink,
    "hashtag" -> HasHashtag,
    "mutual_follow" -> FromMutualFollow,
    "in_network" -> IsInNetwork,
    "text_only" -> TextOnly,
    "has_liked_by_social_context" -> HasLikedBySocialContext,
    "has_followed_by_social_context" -> HasFollowedBySocialContext,
    "has_topic_social_context" -> HasTopicSocialContext,
    "is_followed_topic_tweet" -> IsFollowedTopicTweet,
    "is_recommended_topic_tweet" -> IsRecommendedTopicTweet,
    "tweet_age_less_than_15_seconds" -> IsTweetAgeLessThan15Seconds,
    "tweet_age_lte_30_minutes" -> IsTweetAgeLessThanOrEqualTo30Minutes,
    "tweet_age_lte_1_hour" -> IsTweetAgeLessThanOrEqualTo1Hour,
    "tweet_age_lte_6_hours" -> IsTweetAgeLessThanOrEqualTo6Hours,
    "tweet_age_lte_12_hours" -> IsTweetAgeLessThanOrEqualTo12Hours,
    "tweet_age_gte_24_hours" -> IsTweetAgeGreaterThanOrEqualTo24Hours,
    "has_gte_100_favs" -> HasGreaterThanOrEqualTo100Favs,
    "has_gte_1k_favs" -> HasGreaterThanOrEqualTo1KFavs,
    "has_gte_10k_favs" -> HasGreaterThanOrEqualTo10KFavs,
    "has_gte_100k_favs" -> HasGreaterThanOrEqualTo100KFavs,
    "has_gte_10_retweets" -> HasGreaterThanOrEqualTo10Retweets,
    "has_gte_100_retweets" -> HasGreaterThanOrEqualTo100Retweets,
    "has_gte_1k_retweets" -> HasGreaterThanOrEqualTo1KRetweets
  )

  /**
   * Lookup from a [[CandidateTweetSourceId]] numeric value to its `From*` feature.
   * Both Croon source ids (topic and non-topic) map to the single [[FromCroon]] feature.
   */
  val CandidateTweetSourceIdFeatureMap: Map[Int, Binary] = Map(
    CandidateTweetSourceId.RecapTweet.value -> FromRecap,
    CandidateTweetSourceId.RecycledTweet.value -> FromRecycled,
    CandidateTweetSourceId.RecommendedTweet.value -> FromActivity,
    CandidateTweetSourceId.Simcluster.value -> FromSimcluster,
    CandidateTweetSourceId.ErgTweet.value -> FromErg,
    CandidateTweetSourceId.CroonTopicTweet.value -> FromCroon,
    CandidateTweetSourceId.CroonTweet.value -> FromCroon,
    CandidateTweetSourceId.ListTweet.value -> FromList,
    CandidateTweetSourceId.RecommendedTopicTweet.value -> FromRecTopic
  )

  val TweetFeaturesV2: Set[Feature[_]] = Set(
    HasImage,
    IsReply,
    IsRetweet,
    HasVisibleLink,
    HasHashtag,
    FromMutualFollow,
    IsInNetwork
  )

  // HasVisibleLink used to be listed twice here. A Set collapses duplicates, so dropping
  // the second entry leaves the contents unchanged.
  val ContentTweetTypeFeatures: Set[Feature[_]] = Set(
    HasImage,
    HasVisibleLink,
    HasHashtag,
    TextOnly
  )

  val FreshnessTweetTypeFeatures: Set[Feature[_]] = Set(
    IsTweetAgeLessThan15Seconds,
    IsTweetAgeLessThanOrEqualTo30Minutes,
    IsTweetAgeLessThanOrEqualTo1Hour,
    IsTweetAgeLessThanOrEqualTo6Hours,
    IsTweetAgeLessThanOrEqualTo12Hours,
    IsTweetAgeGreaterThanOrEqualTo24Hours
  )

  val SocialProofTweetTypeFeatures: Set[Feature[_]] = Set(
    HasLikedBySocialContext,
    HasFollowedBySocialContext,
    HasTopicSocialContext
  )

  val TopicTweetPreferenceTweetTypeFeatures: Set[Feature[_]] = Set(
    IsFollowedTopicTweet,
    IsRecommendedTopicTweet
  )

  val TweetPopularityTweetTypeFeatures: Set[Feature[_]] = Set(
    HasGreaterThanOrEqualTo100Favs,
    HasGreaterThanOrEqualTo1KFavs,
    HasGreaterThanOrEqualTo10KFavs,
    HasGreaterThanOrEqualTo100KFavs,
    HasGreaterThanOrEqualTo10Retweets,
    HasGreaterThanOrEqualTo100Retweets,
    HasGreaterThanOrEqualTo1KRetweets
  )

  val UserGraphInteractionTweetTypeFeatures: Set[Feature[_]] = Set(
    IsInNetwork,
    FromMutualFollow,
    IsNotInNetwork,
    IsPromoted
  )

  /** Union of every tweet-type grouping declared above. */
  val UserContentPreferenceTweetTypeFeatures: Set[Feature[_]] =
    ContentTweetTypeFeatures ++
      FreshnessTweetTypeFeatures ++
      SocialProofTweetTypeFeatures ++
      TopicTweetPreferenceTweetTypeFeatures ++
      TweetPopularityTweetTypeFeatures ++
      UserGraphInteractionTweetTypeFeatures

  /** Network-relationship features plus the content tweet-type features. */
  val AuthorContentPreferenceTweetTypeFeatures: Set[Feature[_]] =
    Set(IsInNetwork, FromMutualFollow, IsNotInNetwork) ++ ContentTweetTypeFeatures
}
|
|
@ -0,0 +1,11 @@
|
|||
# Scala library target compiled from every *.scala file that sits next to this BUILD file.
scala_library(
    sources = ["*.scala"],
    platform = "java8",
    tags = ["bazel-compatible"],
    # Targets the sources in this directory are compiled against.
    dependencies = [
        "src/java/com/twitter/ml/api:api-base",
        "src/thrift/com/twitter/dal/personal_data:personal_data-java",
        "src/thrift/com/twitter/ml/api:data-java",
        "timelines/data_processing/ml_util/aggregation_framework:common_types",
    ],
)
|
|
@ -0,0 +1,536 @@
|
|||
package com.twitter.timelines.prediction.features.common
|
||||
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.ml.api.FeatureType
|
||||
import com.twitter.ml.api.Feature.Binary
|
||||
import java.lang.{Boolean => JBoolean}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object CombinedFeatures {
|
||||
val IS_CLICKED =
|
||||
new Binary("timelines.engagement.is_clicked", Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_DWELLED =
|
||||
new Binary("timelines.engagement.is_dwelled", Set(TweetsViewed, EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_IN_BOUNDS_V1 = new Binary(
|
||||
"timelines.engagement.is_dwelled_in_bounds_v1",
|
||||
Set(TweetsViewed, EngagementsPrivate).asJava)
|
||||
val IS_FAVORITED = new Binary(
|
||||
"timelines.engagement.is_favorited",
|
||||
Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_FOLLOWED = new Binary(
|
||||
"timelines.engagement.is_followed",
|
||||
Set(EngagementsPrivate, EngagementsPublic, Follow).asJava)
|
||||
val IS_IMPRESSED =
|
||||
new Binary("timelines.engagement.is_impressed", Set(TweetsViewed, EngagementsPrivate).asJava)
|
||||
val IS_OPEN_LINKED = new Binary(
|
||||
"timelines.engagement.is_open_linked",
|
||||
Set(EngagementsPrivate, LinksClickedOn).asJava)
|
||||
val IS_PHOTO_EXPANDED = new Binary(
|
||||
"timelines.engagement.is_photo_expanded",
|
||||
Set(MediaEngagementActivities, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_CLICKED = new Binary(
|
||||
"timelines.engagement.is_profile_clicked",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_QUOTED = new Binary(
|
||||
"timelines.engagement.is_quoted",
|
||||
Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_REPLIED = new Binary(
|
||||
"timelines.engagement.is_replied",
|
||||
Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_RETWEETED = new Binary(
|
||||
"timelines.engagement.is_retweeted",
|
||||
Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_RETWEETED_WITHOUT_QUOTE = new Binary(
|
||||
"timelines.enagagement.is_retweeted_without_quote",
|
||||
Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_SHARE_DM_CLICKED =
|
||||
new Binary("timelines.engagement.is_tweet_share_dm_clicked", Set(EngagementsPrivate).asJava)
|
||||
val IS_SHARE_DM_SENT =
|
||||
new Binary("timelines.engagement.is_tweet_share_dm_sent", Set(EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_PLAYBACK_25 = new Binary(
|
||||
"timelines.engagement.is_video_playback_25",
|
||||
Set(MediaEngagementActivities, EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_PLAYBACK_50 = new Binary(
|
||||
"timelines.engagement.is_video_playback_50",
|
||||
Set(MediaEngagementActivities, EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_PLAYBACK_75 = new Binary(
|
||||
"timelines.engagement.is_video_playback_75",
|
||||
Set(MediaEngagementActivities, EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_PLAYBACK_95 = new Binary(
|
||||
"timelines.engagement.is_video_playback_95",
|
||||
Set(MediaEngagementActivities, EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_PLAYBACK_COMPLETE = new Binary(
|
||||
"timelines.engagement.is_video_playback_complete",
|
||||
Set(MediaEngagementActivities, EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_PLAYBACK_START = new Binary(
|
||||
"timelines.engagement.is_video_playback_start",
|
||||
Set(MediaEngagementActivities, EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_VIEWED = new Binary(
|
||||
"timelines.engagement.is_video_viewed",
|
||||
Set(MediaEngagementActivities, EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_QUALITY_VIEWED = new Binary(
|
||||
"timelines.engagement.is_video_quality_viewed",
|
||||
Set(MediaEngagementActivities, EngagementsPrivate).asJava
|
||||
)
|
||||
// v1: post click engagements: fav, reply
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_V1 = new Binary(
|
||||
"timelines.engagement.is_good_clicked_convo_desc_favorited_or_replied",
|
||||
Set(
|
||||
TweetsClicked,
|
||||
PublicLikes,
|
||||
PrivateLikes,
|
||||
PublicReplies,
|
||||
PrivateReplies,
|
||||
EngagementsPrivate,
|
||||
EngagementsPublic).asJava)
|
||||
// v2: post click engagements: click
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_V2 = new Binary(
|
||||
"timelines.engagement.is_good_clicked_convo_desc_v2",
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_GOOD_CLICKED_WITH_DWELL_SUM_GTE_60S = new Binary(
|
||||
"timelines.engagement.is_good_clicked_convo_desc_favorited_or_replied_or_dwell_sum_gte_60_secs",
|
||||
Set(
|
||||
TweetsClicked,
|
||||
PublicLikes,
|
||||
PrivateLikes,
|
||||
PublicReplies,
|
||||
PrivateReplies,
|
||||
EngagementsPrivate,
|
||||
EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_FAVORITED = new Binary(
|
||||
"timelines.engagement.is_good_clicked_convo_desc_favorited",
|
||||
Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_REPLIED = new Binary(
|
||||
"timelines.engagement.is_good_clicked_convo_desc_replied",
|
||||
Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_RETWEETED = new Binary(
|
||||
"timelines.engagement.is_good_clicked_convo_desc_retweeted",
|
||||
Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_CLICKED = new Binary(
|
||||
"timelines.engagement.is_good_clicked_convo_desc_clicked",
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_FOLLOWED = new Binary(
|
||||
"timelines.engagement.is_good_clicked_convo_desc_followed",
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_SHARE_DM_CLICKED = new Binary(
|
||||
"timelines.engagement.is_good_clicked_convo_desc_share_dm_clicked",
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_PROFILE_CLICKED = new Binary(
|
||||
"timelines.engagement.is_good_clicked_convo_desc_profile_clicked",
|
||||
Set(EngagementsPrivate).asJava)
|
||||
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_0 = new Binary(
|
||||
"timelines.engagement.is_good_clicked_convo_desc_uam_gt_0",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_1 = new Binary(
|
||||
"timelines.engagement.is_good_clicked_convo_desc_uam_gt_1",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_2 = new Binary(
|
||||
"timelines.engagement.is_good_clicked_convo_desc_uam_gt_2",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_3 = new Binary(
|
||||
"timelines.engagement.is_good_clicked_convo_desc_uam_gt_3",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
|
||||
val IS_TWEET_DETAIL_DWELLED = new Binary(
|
||||
"timelines.engagement.is_tweet_detail_dwelled",
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_TWEET_DETAIL_DWELLED_8_SEC = new Binary(
|
||||
"timelines.engagement.is_tweet_detail_dwelled_8_sec",
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_TWEET_DETAIL_DWELLED_15_SEC = new Binary(
|
||||
"timelines.engagement.is_tweet_detail_dwelled_15_sec",
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_TWEET_DETAIL_DWELLED_25_SEC = new Binary(
|
||||
"timelines.engagement.is_tweet_detail_dwelled_25_sec",
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_TWEET_DETAIL_DWELLED_30_SEC = new Binary(
|
||||
"timelines.engagement.is_tweet_detail_dwelled_30_sec",
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_PROFILE_DWELLED = new Binary(
|
||||
"timelines.engagement.is_profile_dwelled",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_DWELLED_10_SEC = new Binary(
|
||||
"timelines.engagement.is_profile_dwelled_10_sec",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_DWELLED_20_SEC = new Binary(
|
||||
"timelines.engagement.is_profile_dwelled_20_sec",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_DWELLED_30_SEC = new Binary(
|
||||
"timelines.engagement.is_profile_dwelled_30_sec",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_FULLSCREEN_VIDEO_DWELLED = new Binary(
|
||||
"timelines.engagement.is_fullscreen_video_dwelled",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_FULLSCREEN_VIDEO_DWELLED_5_SEC = new Binary(
|
||||
"timelines.engagement.is_fullscreen_video_dwelled_5_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_FULLSCREEN_VIDEO_DWELLED_10_SEC = new Binary(
|
||||
"timelines.engagement.is_fullscreen_video_dwelled_10_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_FULLSCREEN_VIDEO_DWELLED_20_SEC = new Binary(
|
||||
"timelines.engagement.is_fullscreen_video_dwelled_20_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_FULLSCREEN_VIDEO_DWELLED_30_SEC = new Binary(
|
||||
"timelines.engagement.is_fullscreen_video_dwelled_30_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_LINK_DWELLED_15_SEC = new Binary(
|
||||
"timelines.engagement.is_link_dwelled_15_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_LINK_DWELLED_30_SEC = new Binary(
|
||||
"timelines.engagement.is_link_dwelled_30_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_LINK_DWELLED_60_SEC = new Binary(
|
||||
"timelines.engagement.is_link_dwelled_60_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_HOME_LATEST_VISITED =
|
||||
new Binary("timelines.engagement.is_home_latest_visited", Set(EngagementsPrivate).asJava)
|
||||
|
||||
val IS_BOOKMARKED =
|
||||
new Binary("timelines.engagement.is_bookmarked", Set(EngagementsPrivate).asJava)
|
||||
val IS_SHARED =
|
||||
new Binary("timelines.engagement.is_shared", Set(EngagementsPrivate).asJava)
|
||||
val IS_SHARE_MENU_CLICKED =
|
||||
new Binary("timelines.engagement.is_share_menu_clicked", Set(EngagementsPrivate).asJava)
|
||||
|
||||
// Negative engagements
|
||||
val IS_DONT_LIKE = new Binary("timelines.engagement.is_dont_like", Set(EngagementsPrivate).asJava)
|
||||
val IS_BLOCK_CLICKED = new Binary(
|
||||
"timelines.engagement.is_block_clicked",
|
||||
Set(Blocks, TweetsClicked, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_BLOCK_DIALOG_BLOCKED = new Binary(
|
||||
"timelines.engagement.is_block_dialog_blocked",
|
||||
Set(Blocks, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_MUTE_CLICKED = new Binary(
|
||||
"timelines.engagement.is_mute_clicked",
|
||||
Set(Mutes, TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_MUTE_DIALOG_MUTED =
|
||||
new Binary("timelines.engagement.is_mute_dialog_muted", Set(Mutes, EngagementsPrivate).asJava)
|
||||
val IS_REPORT_TWEET_CLICKED = new Binary(
|
||||
"timelines.engagement.is_report_tweet_clicked",
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_CARET_CLICKED =
|
||||
new Binary("timelines.engagement.is_caret_clicked", Set(EngagementsPrivate).asJava)
|
||||
val IS_NOT_ABOUT_TOPIC =
|
||||
new Binary("timelines.engagement.is_not_about_topic", Set(EngagementsPrivate).asJava)
|
||||
val IS_NOT_RECENT =
|
||||
new Binary("timelines.engagement.is_not_recent", Set(EngagementsPrivate).asJava)
|
||||
val IS_NOT_RELEVANT =
|
||||
new Binary("timelines.engagement.is_not_relevant", Set(EngagementsPrivate).asJava)
|
||||
val IS_SEE_FEWER =
|
||||
new Binary("timelines.engagement.is_see_fewer", Set(EngagementsPrivate).asJava)
|
||||
val IS_UNFOLLOW_TOPIC =
|
||||
new Binary("timelines.engagement.is_unfollow_topic", Set(EngagementsPrivate).asJava)
|
||||
val IS_FOLLOW_TOPIC =
|
||||
new Binary("timelines.engagement.is_follow_topic", Set(EngagementsPrivate).asJava)
|
||||
val IS_NOT_INTERESTED_IN_TOPIC =
|
||||
new Binary("timelines.engagement.is_not_interested_in_topic", Set(EngagementsPrivate).asJava)
|
||||
val IS_NEGATIVE_FEEDBACK =
|
||||
new Binary("timelines.engagement.is_negative_feedback", Set(EngagementsPrivate).asJava)
|
||||
val IS_IMPLICIT_POSITIVE_FEEDBACK_UNION =
|
||||
new Binary(
|
||||
"timelines.engagement.is_implicit_positive_feedback_union",
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_EXPLICIT_POSITIVE_FEEDBACK_UNION =
|
||||
new Binary(
|
||||
"timelines.engagement.is_explicit_positive_feedback_union",
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_ALL_NEGATIVE_FEEDBACK_UNION =
|
||||
new Binary(
|
||||
"timelines.engagement.is_all_negative_feedback_union",
|
||||
Set(EngagementsPrivate).asJava)
|
||||
// Reciprocal engagements for reply forward engagement
|
||||
val IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR = new Binary(
|
||||
"timelines.engagement.is_replied_reply_impressed_by_author",
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_REPLIED_REPLY_FAVORITED_BY_AUTHOR = new Binary(
|
||||
"timelines.engagement.is_replied_reply_favorited_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateLikes, PublicLikes).asJava)
|
||||
val IS_REPLIED_REPLY_QUOTED_BY_AUTHOR = new Binary(
|
||||
"timelines.engagement.is_replied_reply_quoted_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava)
|
||||
val IS_REPLIED_REPLY_REPLIED_BY_AUTHOR = new Binary(
|
||||
"timelines.engagement.is_replied_reply_replied_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateReplies, PublicReplies).asJava)
|
||||
val IS_REPLIED_REPLY_RETWEETED_BY_AUTHOR = new Binary(
|
||||
"timelines.engagement.is_replied_reply_retweeted_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava)
|
||||
val IS_REPLIED_REPLY_BLOCKED_BY_AUTHOR = new Binary(
|
||||
"timelines.engagement.is_replied_reply_blocked_by_author",
|
||||
Set(Blocks, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_REPLIED_REPLY_FOLLOWED_BY_AUTHOR = new Binary(
|
||||
"timelines.engagement.is_replied_reply_followed_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic, Follow).asJava)
|
||||
val IS_REPLIED_REPLY_UNFOLLOWED_BY_AUTHOR = new Binary(
|
||||
"timelines.engagement.is_replied_reply_unfollowed_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_REPLIED_REPLY_MUTED_BY_AUTHOR = new Binary(
|
||||
"timelines.engagement.is_replied_reply_muted_by_author",
|
||||
Set(Mutes, EngagementsPrivate).asJava)
|
||||
val IS_REPLIED_REPLY_REPORTED_BY_AUTHOR = new Binary(
|
||||
"timelines.engagement.is_replied_reply_reported_by_author",
|
||||
Set(EngagementsPrivate).asJava)
|
||||
|
||||
// Reciprocal engagements for fav forward engagement
|
||||
val IS_FAVORITED_FAV_FAVORITED_BY_AUTHOR = new Binary(
|
||||
"timelines.engagement.is_favorited_fav_favorited_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateLikes, PublicLikes).asJava
|
||||
)
|
||||
val IS_FAVORITED_FAV_REPLIED_BY_AUTHOR = new Binary(
|
||||
"timelines.engagement.is_favorited_fav_replied_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateReplies, PublicReplies).asJava
|
||||
)
|
||||
val IS_FAVORITED_FAV_RETWEETED_BY_AUTHOR = new Binary(
|
||||
"timelines.engagement.is_favorited_fav_retweeted_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava
|
||||
)
|
||||
val IS_FAVORITED_FAV_FOLLOWED_BY_AUTHOR = new Binary(
|
||||
"timelines.engagement.is_favorited_fav_followed_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava
|
||||
)
|
||||
|
||||
// define good profile click by considering following engagements (follow, fav, reply, retweet, etc.) at profile page
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_FOLLOW = new Binary(
|
||||
"timelines.engagement.is_profile_clicked_and_profile_follow",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, Follow).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_FAV = new Binary(
|
||||
"timelines.engagement.is_profile_clicked_and_profile_fav",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateLikes, PublicLikes).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_REPLY = new Binary(
|
||||
"timelines.engagement.is_profile_clicked_and_profile_reply",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateReplies, PublicReplies).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_RETWEET = new Binary(
|
||||
"timelines.engagement.is_profile_clicked_and_profile_retweet",
|
||||
Set(
|
||||
ProfilesViewed,
|
||||
ProfilesClicked,
|
||||
EngagementsPrivate,
|
||||
PrivateRetweets,
|
||||
PublicRetweets).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_CLICK = new Binary(
|
||||
"timelines.engagement.is_profile_clicked_and_profile_tweet_click",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, TweetsClicked).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_SHARE_DM_CLICK = new Binary(
|
||||
"timelines.engagement.is_profile_clicked_and_profile_share_dm_click",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
// This derived label is the union of all binary features above
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED = new Binary(
|
||||
"timelines.engagement.is_profile_clicked_and_profile_engaged",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
|
||||
// define bad profile click by considering following engagements (user report, tweet report, mute, block, etc) at profile page
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_USER_REPORT_CLICK = new Binary(
|
||||
"timelines.engagement.is_profile_clicked_and_profile_user_report_click",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_REPORT_CLICK = new Binary(
|
||||
"timelines.engagement.is_profile_clicked_and_profile_tweet_report_click",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_MUTE = new Binary(
|
||||
"timelines.engagement.is_profile_clicked_and_profile_mute",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_BLOCK = new Binary(
|
||||
"timelines.engagement.is_profile_clicked_and_profile_block",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
// This derived label is the union of bad profile click engagements and existing negative feedback
|
||||
val IS_NEGATIVE_FEEDBACK_V2 = new Binary(
|
||||
"timelines.engagement.is_negative_feedback_v2",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_NEGATIVE_FEEDBACK_UNION = new Binary(
|
||||
"timelines.engagement.is_negative_feedback_union",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
// don't like, mute or profile page -> mute
|
||||
val IS_WEAK_NEGATIVE_FEEDBACK = new Binary(
|
||||
"timelines.engagement.is_weak_negative_feedback",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
// report, block or profile page -> report, block
|
||||
val IS_STRONG_NEGATIVE_FEEDBACK = new Binary(
|
||||
"timelines.engagement.is_strong_negative_feedback",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
// engagement for following user from any surface area
|
||||
val IS_FOLLOWED_FROM_ANY_SURFACE_AREA = new Binary(
|
||||
"timelines.engagement.is_followed_from_any_surface_area",
|
||||
Set(EngagementsPublic, EngagementsPrivate).asJava)
|
||||
val IS_RELEVANCE_PROMPT_YES_CLICKED = new Binary(
|
||||
"timelines.engagement.is_relevance_prompt_yes_clicked",
|
||||
Set(EngagementsPublic, EngagementsPrivate).asJava)
|
||||
|
||||
// Reply downvote engagements
|
||||
val IS_REPLY_DOWNVOTED =
|
||||
new Binary("timelines.engagement.is_reply_downvoted", Set(EngagementsPrivate).asJava)
|
||||
val IS_REPLY_DOWNVOTE_REMOVED =
|
||||
new Binary("timelines.engagement.is_reply_downvote_removed", Set(EngagementsPrivate).asJava)
|
||||
|
||||
/**
|
||||
* Contains all engagements that are used/consumed by real-time
|
||||
* aggregates summingbird jobs. These engagements need to be
|
||||
* extractable from [[ClientEvent]].
|
||||
*/
|
||||
val EngagementsRealTime: Set[Feature[JBoolean]] = Set(
|
||||
IS_CLICKED,
|
||||
IS_DWELLED,
|
||||
IS_FAVORITED,
|
||||
IS_FOLLOWED,
|
||||
IS_OPEN_LINKED,
|
||||
IS_PHOTO_EXPANDED,
|
||||
IS_PROFILE_CLICKED,
|
||||
IS_QUOTED,
|
||||
IS_REPLIED,
|
||||
IS_RETWEETED,
|
||||
IS_RETWEETED_WITHOUT_QUOTE,
|
||||
IS_SHARE_DM_CLICKED,
|
||||
IS_SHARE_DM_SENT,
|
||||
IS_VIDEO_PLAYBACK_50,
|
||||
IS_VIDEO_VIEWED,
|
||||
IS_VIDEO_QUALITY_VIEWED
|
||||
)
|
||||
|
||||
val NegativeEngagementsRealTime: Set[Feature[JBoolean]] = Set(
|
||||
IS_REPORT_TWEET_CLICKED,
|
||||
IS_BLOCK_CLICKED,
|
||||
IS_MUTE_CLICKED
|
||||
)
|
||||
|
||||
val NegativeEngagementsRealTimeDontLike: Set[Feature[JBoolean]] = Set(
|
||||
IS_DONT_LIKE
|
||||
)
|
||||
|
||||
val NegativeEngagementsSecondary: Set[Feature[JBoolean]] = Set(
|
||||
IS_NOT_INTERESTED_IN_TOPIC,
|
||||
IS_NOT_ABOUT_TOPIC,
|
||||
IS_NOT_RECENT,
|
||||
IS_NOT_RELEVANT,
|
||||
IS_SEE_FEWER,
|
||||
IS_UNFOLLOW_TOPIC
|
||||
)
|
||||
|
||||
val PrivateEngagements: Set[Feature[JBoolean]] = Set(
|
||||
IS_CLICKED,
|
||||
IS_DWELLED,
|
||||
IS_OPEN_LINKED,
|
||||
IS_PHOTO_EXPANDED,
|
||||
IS_PROFILE_CLICKED,
|
||||
IS_QUOTED,
|
||||
IS_VIDEO_PLAYBACK_50,
|
||||
IS_VIDEO_QUALITY_VIEWED
|
||||
)
|
||||
|
||||
val ImpressedEngagements: Set[Feature[JBoolean]] = Set(
|
||||
IS_IMPRESSED
|
||||
)
|
||||
|
||||
val PrivateEngagementsV2: Set[Feature[JBoolean]] = Set(
|
||||
IS_CLICKED,
|
||||
IS_OPEN_LINKED,
|
||||
IS_PHOTO_EXPANDED,
|
||||
IS_PROFILE_CLICKED,
|
||||
IS_VIDEO_PLAYBACK_50,
|
||||
IS_VIDEO_QUALITY_VIEWED
|
||||
) ++ ImpressedEngagements
|
||||
|
||||
val CoreEngagements: Set[Feature[JBoolean]] = Set(
|
||||
IS_FAVORITED,
|
||||
IS_REPLIED,
|
||||
IS_RETWEETED
|
||||
)
|
||||
|
||||
val DwellEngagements: Set[Feature[JBoolean]] = Set(
|
||||
IS_DWELLED
|
||||
)
|
||||
|
||||
val PrivateCoreEngagements: Set[Feature[JBoolean]] = Set(
|
||||
IS_CLICKED,
|
||||
IS_OPEN_LINKED,
|
||||
IS_PHOTO_EXPANDED,
|
||||
IS_VIDEO_PLAYBACK_50,
|
||||
IS_VIDEO_QUALITY_VIEWED
|
||||
)
|
||||
|
||||
val ConditionalEngagements: Set[Feature[JBoolean]] = Set(
|
||||
IS_GOOD_CLICKED_CONVO_DESC_V1,
|
||||
IS_GOOD_CLICKED_CONVO_DESC_V2,
|
||||
IS_GOOD_CLICKED_WITH_DWELL_SUM_GTE_60S
|
||||
)
|
||||
|
||||
val ShareEngagements: Set[Feature[JBoolean]] = Set(
|
||||
IS_SHARED,
|
||||
IS_SHARE_MENU_CLICKED
|
||||
)
|
||||
|
||||
val BookmarkEngagements: Set[Feature[JBoolean]] = Set(
|
||||
IS_BOOKMARKED
|
||||
)
|
||||
|
||||
val TweetDetailDwellEngagements: Set[Feature[JBoolean]] = Set(
|
||||
IS_TWEET_DETAIL_DWELLED,
|
||||
IS_TWEET_DETAIL_DWELLED_8_SEC,
|
||||
IS_TWEET_DETAIL_DWELLED_15_SEC,
|
||||
IS_TWEET_DETAIL_DWELLED_25_SEC,
|
||||
IS_TWEET_DETAIL_DWELLED_30_SEC
|
||||
)
|
||||
|
||||
val ProfileDwellEngagements: Set[Feature[JBoolean]] = Set(
|
||||
IS_PROFILE_DWELLED,
|
||||
IS_PROFILE_DWELLED_10_SEC,
|
||||
IS_PROFILE_DWELLED_20_SEC,
|
||||
IS_PROFILE_DWELLED_30_SEC
|
||||
)
|
||||
|
||||
val FullscreenVideoDwellEngagements: Set[Feature[JBoolean]] = Set(
|
||||
IS_FULLSCREEN_VIDEO_DWELLED,
|
||||
IS_FULLSCREEN_VIDEO_DWELLED_5_SEC,
|
||||
IS_FULLSCREEN_VIDEO_DWELLED_10_SEC,
|
||||
IS_FULLSCREEN_VIDEO_DWELLED_20_SEC,
|
||||
IS_FULLSCREEN_VIDEO_DWELLED_30_SEC
|
||||
)
|
||||
|
||||
// Capacity warning: user-author real-time aggregates have a very large key
// space. Estimate the impact on capacity requirements before adding any new
// engagement to this set.
val UserAuthorEngagements: Set[Feature[JBoolean]] =
  CoreEngagements ++ DwellEngagements ++ Set(
    IS_CLICKED,
    IS_PROFILE_CLICKED,
    IS_PHOTO_EXPANDED,
    IS_VIDEO_PLAYBACK_50,
    IS_NEGATIVE_FEEDBACK_UNION
  )
|
||||
|
||||
// Labels grouped as implicit positive engagements.
// Note that IS_QUOTED is also listed in ExplicitPositiveEngagements.
val ImplicitPositiveEngagements: Set[Feature[JBoolean]] =
  Set(
    IS_CLICKED,
    IS_DWELLED,
    IS_OPEN_LINKED,
    IS_PROFILE_CLICKED,
    IS_QUOTED,
    IS_VIDEO_PLAYBACK_50,
    IS_VIDEO_QUALITY_VIEWED,
    IS_TWEET_DETAIL_DWELLED,
    IS_GOOD_CLICKED_CONVO_DESC_V1,
    IS_GOOD_CLICKED_CONVO_DESC_V2,
    IS_SHARED,
    IS_SHARE_MENU_CLICKED,
    IS_SHARE_DM_SENT,
    IS_SHARE_DM_CLICKED
  )

// CoreEngagements extended with the follow and quote labels.
val ExplicitPositiveEngagements: Set[Feature[JBoolean]] =
  CoreEngagements ++ Set(IS_FOLLOWED, IS_QUOTED)
|
||||
|
||||
// Union of both real-time negative engagement sets with the
// not-recent / not-relevant / see-fewer labels.
val AllNegativeEngagements: Set[Feature[JBoolean]] =
  NegativeEngagementsRealTime ++
    NegativeEngagementsRealTimeDontLike ++
    Set(IS_NOT_RECENT, IS_NOT_RELEVANT, IS_SEE_FEWER)
|
||||
}
|
|
@ -0,0 +1,97 @@
|
|||
package com.twitter.timelines.prediction.features.common
|
||||
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.ml.api.Feature.Binary
|
||||
import java.lang.{Boolean => JBoolean}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Engagement label features whose names start with "profile".
 *
 * Every label is a [[Binary]] feature constructed with its feature name and the set of
 * personal data types it is tagged with.
 */
object ProfileLabelFeatures {
  private val prefix = "profile"

  // Positive engagements
  val IS_CLICKED: Binary =
    new Binary(s"${prefix}.engagement.is_clicked", Set(TweetsClicked, EngagementsPrivate).asJava)

  val IS_DWELLED: Binary =
    new Binary(s"${prefix}.engagement.is_dwelled", Set(TweetsViewed, EngagementsPrivate).asJava)

  val IS_FAVORITED: Binary = new Binary(
    s"${prefix}.engagement.is_favorited",
    Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava
  )

  val IS_REPLIED: Binary = new Binary(
    s"${prefix}.engagement.is_replied",
    Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava
  )

  val IS_RETWEETED: Binary = new Binary(
    s"${prefix}.engagement.is_retweeted",
    Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava
  )

  // Negative engagements
  val IS_DONT_LIKE: Binary =
    new Binary(s"${prefix}.engagement.is_dont_like", Set(EngagementsPrivate).asJava)

  val IS_BLOCK_CLICKED: Binary = new Binary(
    s"${prefix}.engagement.is_block_clicked",
    Set(Blocks, TweetsClicked, EngagementsPrivate, EngagementsPublic).asJava
  )

  val IS_MUTE_CLICKED: Binary = new Binary(
    s"${prefix}.engagement.is_mute_clicked",
    Set(Mutes, TweetsClicked, EngagementsPrivate).asJava
  )

  val IS_REPORT_TWEET_CLICKED: Binary = new Binary(
    s"${prefix}.engagement.is_report_tweet_clicked",
    Set(TweetsClicked, EngagementsPrivate).asJava
  )

  // Declared alongside the negative labels but not a member of NegativeEngagements below.
  val IS_NEGATIVE_FEEDBACK_UNION: Binary = new Binary(
    s"${prefix}.engagement.is_negative_feedback_union",
    Set(EngagementsPrivate, Blocks, Mutes, TweetsClicked, EngagementsPublic).asJava
  )

  /** The five positive engagement labels declared above. */
  val CoreEngagements: Set[Feature[JBoolean]] =
    Set(
      IS_CLICKED,
      IS_DWELLED,
      IS_FAVORITED,
      IS_REPLIED,
      IS_RETWEETED
    )

  /** The individual negative engagement labels (excludes the union label). */
  val NegativeEngagements: Set[Feature[JBoolean]] =
    Set(
      IS_DONT_LIKE,
      IS_BLOCK_CLICKED,
      IS_MUTE_CLICKED,
      IS_REPORT_TWEET_CLICKED
    )
}
|
||||
|
||||
/**
 * Engagement label features whose names start with "search".
 *
 * Every label is a [[Binary]] feature constructed with its feature name and the set of
 * personal data types it is tagged with.
 */
object SearchLabelFeatures {
  private val prefix = "search"

  val IS_CLICKED: Binary =
    new Binary(s"${prefix}.engagement.is_clicked", Set(TweetsClicked, EngagementsPrivate).asJava)

  val IS_DWELLED: Binary =
    new Binary(s"${prefix}.engagement.is_dwelled", Set(TweetsViewed, EngagementsPrivate).asJava)

  val IS_FAVORITED: Binary = new Binary(
    s"${prefix}.engagement.is_favorited",
    Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava
  )

  val IS_REPLIED: Binary = new Binary(
    s"${prefix}.engagement.is_replied",
    Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava
  )

  val IS_RETWEETED: Binary = new Binary(
    s"${prefix}.engagement.is_retweeted",
    Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava
  )

  // Profile clicks, one label per surface named in the feature name.
  val IS_PROFILE_CLICKED_SEARCH_RESULT_USER: Binary = new Binary(
    s"${prefix}.engagement.is_profile_clicked_search_result_user",
    Set(ProfilesClicked, ProfilesViewed, EngagementsPrivate).asJava
  )

  val IS_PROFILE_CLICKED_SEARCH_RESULT_TWEET: Binary = new Binary(
    s"${prefix}.engagement.is_profile_clicked_search_result_tweet",
    Set(ProfilesClicked, ProfilesViewed, EngagementsPrivate).asJava
  )

  val IS_PROFILE_CLICKED_TYPEAHEAD_USER: Binary = new Binary(
    s"${prefix}.engagement.is_profile_clicked_typeahead_user",
    Set(ProfilesClicked, ProfilesViewed, EngagementsPrivate).asJava
  )

  /** Every label declared in this object. */
  val CoreEngagements: Set[Feature[JBoolean]] =
    Set(
      IS_CLICKED,
      IS_DWELLED,
      IS_FAVORITED,
      IS_REPLIED,
      IS_RETWEETED,
      IS_PROFILE_CLICKED_SEARCH_RESULT_USER,
      IS_PROFILE_CLICKED_SEARCH_RESULT_TWEET,
      IS_PROFILE_CLICKED_TYPEAHEAD_USER
    )
}
|
||||
// Add Tweet Detail labels later
|
|
@ -0,0 +1,759 @@
|
|||
package com.twitter.timelines.prediction.features.common
|
||||
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import com.twitter.ml.api.Feature.Binary
|
||||
import com.twitter.ml.api.Feature.Continuous
|
||||
import com.twitter.ml.api.Feature.Discrete
|
||||
import com.twitter.ml.api.Feature.SparseBinary
|
||||
import com.twitter.ml.api.Feature.SparseContinuous
|
||||
import com.twitter.ml.api.Feature.Text
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/** Shared features built with an empty prefix. */
object TimelinesSharedFeatures extends TimelinesSharedFeatures("")

/** Shared features built with the "in_reply_to_tweet" prefix. */
object InReplyToTweetTimelinesSharedFeatures extends TimelinesSharedFeatures("in_reply_to_tweet")
|
||||
|
||||
/**
|
||||
* Defines shared features
|
||||
*/
|
||||
class TimelinesSharedFeatures(prefix: String) {
|
||||
/**
 * Qualifies a feature name with this instance's prefix.
 *
 * @param featureName unqualified feature name
 * @return `featureName` unchanged when the prefix is empty, otherwise "prefix.featureName"
 */
private def name(featureName: String): String =
  if (prefix.isEmpty) featureName else s"$prefix.$featureName"
|
||||
|
||||
// Meta: experiment metadata, tagged with the ExperimentId and ExperimentName data types.
val EXPERIMENT_META: SparseBinary =
  new SparseBinary(name("timelines.meta.experiment_meta"), Set(ExperimentId, ExperimentName).asJava)
|
||||
|
||||
// Historically used in the "combined models" to distinguish in-network from
// out-of-network tweets. The feature now denotes which adapter (recap or
// rectweet) generated the data records, and the data collection pipeline uses
// it to split the training data.
val INJECTION_TYPE: Discrete = new Discrete(name("timelines.meta.injection_type"))

// Indicates which injection module this is.
val INJECTION_MODULE_NAME: Text = new Text(name("timelines.meta.injection_module_name"))

val LIST_ID: Discrete = new Discrete(name("timelines.meta.list_id"))
val LIST_IS_PINNED: Binary = new Binary(name("timelines.meta.list_is_pinned"))
|
||||
|
||||
// Internal id assigned per PS request; mainly used to join common features
// back to candidate features later.
val PREDICTION_REQUEST_ID: Discrete = new Discrete(name("timelines.meta.prediction_request_id"))

// Internal id assigned per TLM request; mainly used to deduplicate re-served
// cached tweets in logging.
val SERVED_REQUEST_ID: Discrete = new Discrete(name("timelines.meta.served_request_id"))

// Internal id used as the join key in kafka logging: equal to servedRequestId
// if the tweet is cached, otherwise equal to predictionRequestId.
val SERVED_ID: Discrete = new Discrete(name("timelines.meta.served_id"))
val REQUEST_JOIN_ID: Discrete = new Discrete(name("timelines.meta.request_join_id"))

// Internal per-tweet boolean flag: whether the tweet was served from
// RankedTweetsCache (TQ-14050). This feature should not be trained on; it is
// blacklisted in feature_config (D838346).
val IS_READ_FROM_CACHE: Binary = new Binary(name("timelines.meta.is_read_from_cache"))
|
||||
|
||||
// Model score discounts, all under the "timelines.score_discounts" namespace.
val PHOTO_DISCOUNT: Continuous = new Continuous(name("timelines.score_discounts.photo"))
val VIDEO_DISCOUNT: Continuous = new Continuous(name("timelines.score_discounts.video"))
val TWEET_HEIGHT_DISCOUNT: Continuous =
  new Continuous(name("timelines.score_discounts.tweet_height"))
val TOXICITY_DISCOUNT: Continuous = new Continuous(name("timelines.score_discounts.toxicity"))
|
||||
|
||||
// Engagements: the engagement type, followed by predicted-engagement features
// under "timelines.engagement_predicted", each tagged with EngagementScore.
val ENGAGEMENT_TYPE: Discrete = new Discrete(name("timelines.engagement.type"))

val PREDICTED_IS_FAVORITED: Continuous =
  new Continuous(name("timelines.engagement_predicted.is_favorited"), Set(EngagementScore).asJava)

val PREDICTED_IS_RETWEETED: Continuous =
  new Continuous(name("timelines.engagement_predicted.is_retweeted"), Set(EngagementScore).asJava)

val PREDICTED_IS_QUOTED: Continuous =
  new Continuous(name("timelines.engagement_predicted.is_quoted"), Set(EngagementScore).asJava)

val PREDICTED_IS_REPLIED: Continuous =
  new Continuous(name("timelines.engagement_predicted.is_replied"), Set(EngagementScore).asJava)

val PREDICTED_IS_OPEN_LINKED: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_open_linked"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_GOOD_OPEN_LINK: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_good_open_link"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_PROFILE_CLICKED: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_profile_clicked"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_profile_clicked_and_profile_engaged"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_CLICKED: Continuous =
  new Continuous(name("timelines.engagement_predicted.is_clicked"), Set(EngagementScore).asJava)

val PREDICTED_IS_PHOTO_EXPANDED: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_photo_expanded"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_FOLLOWED: Continuous =
  new Continuous(name("timelines.engagement_predicted.is_followed"), Set(EngagementScore).asJava)

val PREDICTED_IS_DONT_LIKE: Continuous =
  new Continuous(name("timelines.engagement_predicted.is_dont_like"), Set(EngagementScore).asJava)

val PREDICTED_IS_VIDEO_PLAYBACK_50: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_video_playback_50"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_VIDEO_QUALITY_VIEWED: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_video_quality_viewed"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_GOOD_CLICKED_V1: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_good_clicked_convo_desc_favorited_or_replied"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_GOOD_CLICKED_V2: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_good_clicked_convo_desc_v2"),
  Set(EngagementScore).asJava
)
|
||||
// Predicted tweet-detail dwell at the 8/15/25/30 second thresholds.
val PREDICTED_IS_TWEET_DETAIL_DWELLED_8_SEC: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_tweet_detail_dwelled_8_sec"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_TWEET_DETAIL_DWELLED_15_SEC: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_tweet_detail_dwelled_15_sec"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_TWEET_DETAIL_DWELLED_25_SEC: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_tweet_detail_dwelled_25_sec"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_TWEET_DETAIL_DWELLED_30_SEC: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_tweet_detail_dwelled_30_sec"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_GOOD_CLICKED_WITH_DWELL_SUM_GTE_60S: Continuous = new Continuous(
  name(
    "timelines.engagement_predicted.is_good_clicked_convo_desc_favorited_or_replied_or_dwell_sum_gte_60_secs"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_FAVORITED_FAV_ENGAGED_BY_AUTHOR: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_favorited_fav_engaged_by_author"),
  Set(EngagementScore).asJava
)

// Predicted report click and negative-feedback variants.
val PREDICTED_IS_REPORT_TWEET_CLICKED: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_report_tweet_clicked"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_NEGATIVE_FEEDBACK: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_negative_feedback"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_NEGATIVE_FEEDBACK_V2: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_negative_feedback_v2"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_WEAK_NEGATIVE_FEEDBACK: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_weak_negative_feedback"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_STRONG_NEGATIVE_FEEDBACK: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_strong_negative_feedback"),
  Set(EngagementScore).asJava
)
|
||||
|
||||
// Predicted dwell features.
val PREDICTED_IS_DWELLED_IN_BOUNDS_V1: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_dwelled_in_bounds_v1"),
  Set(EngagementScore).asJava
)

val PREDICTED_DWELL_NORMALIZED_OVERALL: Continuous = new Continuous(
  name("timelines.engagement_predicted.dwell_normalized_overall"),
  Set(EngagementScore).asJava
)

val PREDICTED_DWELL_CDF: Continuous =
  new Continuous(name("timelines.engagement_predicted.dwell_cdf"), Set(EngagementScore).asJava)

val PREDICTED_DWELL_CDF_OVERALL: Continuous = new Continuous(
  name("timelines.engagement_predicted.dwell_cdf_overall"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_DWELLED: Continuous =
  new Continuous(name("timelines.engagement_predicted.is_dwelled"), Set(EngagementScore).asJava)

val PREDICTED_IS_HOME_LATEST_VISITED: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_home_latest_visited"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_BOOKMARKED: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_bookmarked"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_SHARED: Continuous =
  new Continuous(name("timelines.engagement_predicted.is_shared"), Set(EngagementScore).asJava)

val PREDICTED_IS_SHARE_MENU_CLICKED: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_share_menu_clicked"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_PROFILE_DWELLED_20_SEC: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_profile_dwelled_20_sec"),
  Set(EngagementScore).asJava
)

// Predicted fullscreen-video dwell at the 5/10/20/30 second thresholds.
val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_5_SEC: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_fullscreen_video_dwelled_5_sec"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_10_SEC: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_fullscreen_video_dwelled_10_sec"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_20_SEC: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_fullscreen_video_dwelled_20_sec"),
  Set(EngagementScore).asJava
)

val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_30_SEC: Continuous = new Continuous(
  name("timelines.engagement_predicted.is_fullscreen_video_dwelled_30_sec"),
  Set(EngagementScore).asJava
)
|
||||
|
||||
// Use this timestamp, not `meta.timestamp`, for the actual served timestamp.
// NOTE(review): the timestamp feature names below are plain literals that are
// not passed through name(), so they are the same for every prefix; confirm
// this is intended.
val SERVED_TIMESTAMP: Discrete =
  new Discrete("timelines.meta.timestamp.served", Set(PrivateTimestamp).asJava)

// Timestamps of when each engagement occurred. Do not train on these features.
val TIMESTAMP_FAVORITED: Discrete =
  new Discrete("timelines.meta.timestamp.engagement.favorited", Set(PublicTimestamp).asJava)

val TIMESTAMP_RETWEETED: Discrete =
  new Discrete("timelines.meta.timestamp.engagement.retweeted", Set(PublicTimestamp).asJava)

val TIMESTAMP_REPLIED: Discrete =
  new Discrete("timelines.meta.timestamp.engagement.replied", Set(PublicTimestamp).asJava)

val TIMESTAMP_PROFILE_CLICKED: Discrete = new Discrete(
  "timelines.meta.timestamp.engagement.profile_clicked",
  Set(PrivateTimestamp).asJava
)

val TIMESTAMP_CLICKED: Discrete =
  new Discrete("timelines.meta.timestamp.engagement.clicked", Set(PrivateTimestamp).asJava)

val TIMESTAMP_PHOTO_EXPANDED: Discrete =
  new Discrete("timelines.meta.timestamp.engagement.photo_expanded", Set(PrivateTimestamp).asJava)

val TIMESTAMP_DWELLED: Discrete =
  new Discrete("timelines.meta.timestamp.engagement.dwelled", Set(PrivateTimestamp).asJava)

val TIMESTAMP_VIDEO_PLAYBACK_50: Discrete = new Discrete(
  "timelines.meta.timestamp.engagement.video_playback_50",
  Set(PrivateTimestamp).asJava
)
|
||||
// Reply engaged by author.
val TIMESTAMP_REPLY_FAVORITED_BY_AUTHOR: Discrete = new Discrete(
  "timelines.meta.timestamp.engagement.reply_favorited_by_author",
  Set(PublicTimestamp).asJava
)

val TIMESTAMP_REPLY_REPLIED_BY_AUTHOR: Discrete = new Discrete(
  "timelines.meta.timestamp.engagement.reply_replied_by_author",
  Set(PublicTimestamp).asJava
)

val TIMESTAMP_REPLY_RETWEETED_BY_AUTHOR: Discrete = new Discrete(
  "timelines.meta.timestamp.engagement.reply_retweeted_by_author",
  Set(PublicTimestamp).asJava
)

// Fav engaged by author.
val TIMESTAMP_FAV_FAVORITED_BY_AUTHOR: Discrete = new Discrete(
  "timelines.meta.timestamp.engagement.fav_favorited_by_author",
  Set(PublicTimestamp).asJava
)

val TIMESTAMP_FAV_REPLIED_BY_AUTHOR: Discrete = new Discrete(
  "timelines.meta.timestamp.engagement.fav_replied_by_author",
  Set(PublicTimestamp).asJava
)

val TIMESTAMP_FAV_RETWEETED_BY_AUTHOR: Discrete = new Discrete(
  "timelines.meta.timestamp.engagement.fav_retweeted_by_author",
  Set(PublicTimestamp).asJava
)

val TIMESTAMP_FAV_FOLLOWED_BY_AUTHOR: Discrete = new Discrete(
  "timelines.meta.timestamp.engagement.fav_followed_by_author",
  Set(PublicTimestamp).asJava
)
|
||||
// Good click.
val TIMESTAMP_GOOD_CLICK_CONVO_DESC_FAVORITED: Discrete = new Discrete(
  "timelines.meta.timestamp.engagement.good_click_convo_desc_favorited",
  Set(PrivateTimestamp).asJava
)

val TIMESTAMP_GOOD_CLICK_CONVO_DESC_REPLIIED: Discrete = new Discrete(
  "timelines.meta.timestamp.engagement.good_click_convo_desc_replied",
  Set(PrivateTimestamp).asJava
)

// NOTE(review): the feature name below spells "profiile" with a double "i".
// It is kept as-is because the string is the feature's identity.
val TIMESTAMP_GOOD_CLICK_CONVO_DESC_PROFILE_CLICKED: Discrete = new Discrete(
  "timelines.meta.timestamp.engagement.good_click_convo_desc_profiile_clicked",
  Set(PrivateTimestamp).asJava
)

val TIMESTAMP_NEGATIVE_FEEDBACK: Discrete = new Discrete(
  "timelines.meta.timestamp.engagement.negative_feedback",
  Set(PrivateTimestamp).asJava
)

val TIMESTAMP_REPORT_TWEET_CLICK: Discrete = new Discrete(
  "timelines.meta.timestamp.engagement.report_tweet_click",
  Set(PrivateTimestamp).asJava
)

val TIMESTAMP_IMPRESSED: Discrete =
  new Discrete("timelines.meta.timestamp.engagement.impressed", Set(PublicTimestamp).asJava)

val TIMESTAMP_TWEET_DETAIL_DWELLED: Discrete = new Discrete(
  "timelines.meta.timestamp.engagement.tweet_detail_dwelled",
  Set(PublicTimestamp).asJava
)

val TIMESTAMP_PROFILE_DWELLED: Discrete =
  new Discrete("timelines.meta.timestamp.engagement.profile_dwelled", Set(PublicTimestamp).asJava)

val TIMESTAMP_FULLSCREEN_VIDEO_DWELLED: Discrete = new Discrete(
  "timelines.meta.timestamp.engagement.fullscreen_video_dwelled",
  Set(PublicTimestamp).asJava
)

val TIMESTAMP_LINK_DWELLED: Discrete =
  new Discrete("timelines.meta.timestamp.engagement.link_dwelled", Set(PublicTimestamp).asJava)
|
||||
|
||||
// Used to duplicate and split the negative instances during streaming
// processing (kafka).
val TRAINING_FOR_FAVORITED: Binary =
  new Binary("timelines.meta.training_data.for_favorited", Set(EngagementId).asJava)

val TRAINING_FOR_RETWEETED: Binary =
  new Binary("timelines.meta.training_data.for_retweeted", Set(EngagementId).asJava)

val TRAINING_FOR_REPLIED: Binary =
  new Binary("timelines.meta.training_data.for_replied", Set(EngagementId).asJava)

val TRAINING_FOR_PROFILE_CLICKED: Binary =
  new Binary("timelines.meta.training_data.for_profile_clicked", Set(EngagementId).asJava)

val TRAINING_FOR_CLICKED: Binary =
  new Binary("timelines.meta.training_data.for_clicked", Set(EngagementId).asJava)

val TRAINING_FOR_PHOTO_EXPANDED: Binary =
  new Binary("timelines.meta.training_data.for_photo_expanded", Set(EngagementId).asJava)

val TRAINING_FOR_VIDEO_PLAYBACK_50: Binary =
  new Binary("timelines.meta.training_data.for_video_playback_50", Set(EngagementId).asJava)

val TRAINING_FOR_NEGATIVE_FEEDBACK: Binary =
  new Binary("timelines.meta.training_data.for_negative_feedback", Set(EngagementId).asJava)

val TRAINING_FOR_REPORTED: Binary =
  new Binary("timelines.meta.training_data.for_reported", Set(EngagementId).asJava)

val TRAINING_FOR_DWELLED: Binary =
  new Binary("timelines.meta.training_data.for_dwelled", Set(EngagementId).asJava)

val TRAINING_FOR_SHARED: Binary =
  new Binary("timelines.meta.training_data.for_shared", Set(EngagementId).asJava)

val TRAINING_FOR_SHARE_MENU_CLICKED: Binary =
  new Binary("timelines.meta.training_data.for_share_menu_clicked", Set(EngagementId).asJava)
|
||||
|
||||
// Warning: do not train on these features.
val PREDICTED_SCORE: Continuous =
  new Continuous(name("timelines.score"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_FAV: Continuous =
  new Continuous(name("timelines.score.fav"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_RETWEET: Continuous =
  new Continuous(name("timelines.score.retweet"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_REPLY: Continuous =
  new Continuous(name("timelines.score.reply"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_OPEN_LINK: Continuous =
  new Continuous(name("timelines.score.open_link"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_GOOD_OPEN_LINK: Continuous =
  new Continuous(name("timelines.score.good_open_link"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_PROFILE_CLICK: Continuous =
  new Continuous(name("timelines.score.profile_click"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_DETAIL_EXPAND: Continuous =
  new Continuous(name("timelines.score.detail_expand"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_PHOTO_EXPAND: Continuous =
  new Continuous(name("timelines.score.photo_expand"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_PLAYBACK_50: Continuous =
  new Continuous(name("timelines.score.playback_50"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_VIDEO_QUALITY_VIEW: Continuous =
  new Continuous(name("timelines.score.video_quality_view"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_DONT_LIKE: Continuous =
  new Continuous(name("timelines.score.dont_like"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_PROFILE_CLICKED_AND_PROFILE_ENGAGED: Continuous = new Continuous(
  name("timelines.score.profile_clicked_and_profile_engaged"),
  Set(EngagementScore).asJava
)
|
||||
// More "timelines.score.*" features, each tagged with EngagementScore.
val PREDICTED_SCORE_GOOD_CLICKED_V1: Continuous =
  new Continuous(name("timelines.score.good_clicked_v1"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_GOOD_CLICKED_V2: Continuous =
  new Continuous(name("timelines.score.good_clicked_v2"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_DWELL: Continuous =
  new Continuous(name("timelines.score.dwell"), Set(EngagementScore).asJava)

// NOTE(review): the two CDF feature names below are spelled "cfd". They are
// kept as-is because the string is the feature's identity.
val PREDICTED_SCORE_DWELL_CDF: Continuous =
  new Continuous(name("timelines.score.dwell_cfd"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_DWELL_CDF_OVERALL: Continuous =
  new Continuous(name("timelines.score.dwell_cfd_overall"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_DWELL_NORMALIZED_OVERALL: Continuous =
  new Continuous(name("timelines.score.dwell_normalized_overall"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_NEGATIVE_FEEDBACK: Continuous =
  new Continuous(name("timelines.score.negative_feedback"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_NEGATIVE_FEEDBACK_V2: Continuous =
  new Continuous(name("timelines.score.negative_feedback_v2"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_WEAK_NEGATIVE_FEEDBACK: Continuous =
  new Continuous(name("timelines.score.weak_negative_feedback"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_STRONG_NEGATIVE_FEEDBACK: Continuous =
  new Continuous(name("timelines.score.strong_negative_feedback"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_REPORT_TWEET_CLICKED: Continuous =
  new Continuous(name("timelines.score.report_tweet_clicked"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_UNFOLLOW_TOPIC: Continuous =
  new Continuous(name("timelines.score.unfollow_topic"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_FOLLOW: Continuous =
  new Continuous(name("timelines.score.follow"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_RELEVANCE_PROMPT_YES_CLICKED: Continuous = new Continuous(
  name("timelines.score.relevance_prompt_yes_clicked"),
  Set(EngagementScore).asJava
)

val PREDICTED_SCORE_BOOKMARK: Continuous =
  new Continuous(name("timelines.score.bookmark"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_SHARE: Continuous =
  new Continuous(name("timelines.score.share"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_SHARE_MENU_CLICK: Continuous =
  new Continuous(name("timelines.score.share_menu_click"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_PROFILE_DWELLED: Continuous =
  new Continuous(name("timelines.score.good_profile_dwelled"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_TWEET_DETAIL_DWELLED: Continuous =
  new Continuous(name("timelines.score.tweet_detail_dwelled"), Set(EngagementScore).asJava)

val PREDICTED_SCORE_FULLSCREEN_VIDEO_DWELL: Continuous =
  new Continuous(name("timelines.score.fullscreen_video_dwell"), Set(EngagementScore).asJava)
|
||||
|
||||
// Hydrated in TimelinesSharedFeaturesAdapter, which the recap adapter calls.
val ORIGINAL_AUTHOR_ID: Discrete =
  new Discrete(name("entities.original_author_id"), Set(UserId).asJava)

val SOURCE_AUTHOR_ID: Discrete =
  new Discrete(name("entities.source_author_id"), Set(UserId).asJava)

val SOURCE_TWEET_ID: Discrete =
  new Discrete(name("entities.source_tweet_id"), Set(TweetId).asJava)

val TOPIC_ID: Discrete =
  new Discrete(name("entities.topic_id"), Set(SemanticcoreClassification).asJava)

val INFERRED_TOPIC_IDS: SparseBinary =
  new SparseBinary(name("entities.inferred_topic_ids"), Set(SemanticcoreClassification).asJava)

// Derived from INFERRED_TOPIC_IDS through TypedAggregateGroup.sparseFeature.
val INFERRED_TOPIC_ID = TypedAggregateGroup.sparseFeature(INFERRED_TOPIC_IDS)
|
||||
|
||||
// Engagement count features under "timelines.earlybird".
// NOTE(review): every *_QUOTE_COUNT feature is tagged with the retweet count
// data types; confirm that is intended.
val WEIGHTED_FAV_COUNT: Continuous = new Continuous(
  name("timelines.earlybird.weighted_fav_count"),
  Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
)

val WEIGHTED_RETWEET_COUNT: Continuous = new Continuous(
  name("timelines.earlybird.weighted_retweet_count"),
  Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
)

val WEIGHTED_REPLY_COUNT: Continuous = new Continuous(
  name("timelines.earlybird.weighted_reply_count"),
  Set(CountOfPrivateReplies, CountOfPublicReplies).asJava
)

val WEIGHTED_QUOTE_COUNT: Continuous = new Continuous(
  name("timelines.earlybird.weighted_quote_count"),
  Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
)

val EMBEDS_IMPRESSION_COUNT_V2: Continuous = new Continuous(
  name("timelines.earlybird.embeds_impression_count_v2"),
  Set(CountOfImpression).asJava
)

val EMBEDS_URL_COUNT_V2: Continuous = new Continuous(
  name("timelines.earlybird.embeds_url_count_v2"),
  Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava
)

val DECAYED_FAVORITE_COUNT: Continuous = new Continuous(
  name("timelines.earlybird.decayed_favorite_count"),
  Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
)

val DECAYED_RETWEET_COUNT: Continuous = new Continuous(
  name("timelines.earlybird.decayed_retweet_count"),
  Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
)

val DECAYED_REPLY_COUNT: Continuous = new Continuous(
  name("timelines.earlybird.decayed_reply_count"),
  Set(CountOfPrivateReplies, CountOfPublicReplies).asJava
)

val DECAYED_QUOTE_COUNT: Continuous = new Continuous(
  name("timelines.earlybird.decayed_quote_count"),
  Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
)

val FAKE_FAVORITE_COUNT: Continuous = new Continuous(
  name("timelines.earlybird.fake_favorite_count"),
  Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
)

val FAKE_RETWEET_COUNT: Continuous = new Continuous(
  name("timelines.earlybird.fake_retweet_count"),
  Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
)

val FAKE_REPLY_COUNT: Continuous = new Continuous(
  name("timelines.earlybird.fake_reply_count"),
  Set(CountOfPrivateReplies, CountOfPublicReplies).asJava
)

val FAKE_QUOTE_COUNT: Continuous = new Continuous(
  name("timelines.earlybird.fake_quote_count"),
  Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
)

val QUOTE_COUNT: Continuous = new Continuous(
  name("timelines.earlybird.quote_count"),
  Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
)
|
||||
|
||||
// Safety features, each tagged with TweetSafetyLabels.
val LABEL_ABUSIVE_FLAG: Binary =
  new Binary(name("timelines.earlybird.label_abusive_flag"), Set(TweetSafetyLabels).asJava)

val LABEL_ABUSIVE_HI_RCL_FLAG: Binary =
  new Binary(name("timelines.earlybird.label_abusive_hi_rcl_flag"), Set(TweetSafetyLabels).asJava)

val LABEL_DUP_CONTENT_FLAG: Binary =
  new Binary(name("timelines.earlybird.label_dup_content_flag"), Set(TweetSafetyLabels).asJava)

val LABEL_NSFW_HI_PRC_FLAG: Binary =
  new Binary(name("timelines.earlybird.label_nsfw_hi_prc_flag"), Set(TweetSafetyLabels).asJava)

val LABEL_NSFW_HI_RCL_FLAG: Binary =
  new Binary(name("timelines.earlybird.label_nsfw_hi_rcl_flag"), Set(TweetSafetyLabels).asJava)

val LABEL_SPAM_FLAG: Binary =
  new Binary(name("timelines.earlybird.label_spam_flag"), Set(TweetSafetyLabels).asJava)

val LABEL_SPAM_HI_RCL_FLAG: Binary =
  new Binary(name("timelines.earlybird.label_spam_hi_rcl_flag"), Set(TweetSafetyLabels).asJava)
|
||||
|
||||
// Periscope features: binary flags in the timelines.earlybird namespace.
// PERISCOPE_EXISTS carries the tweet-entity annotations; the remaining flags carry
// the broadcast-metrics annotations.
val PERISCOPE_EXISTS =
  new Binary(
    name("timelines.earlybird.periscope_exists"),
    Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
val PERISCOPE_IS_LIVE =
  new Binary(
    name("timelines.earlybird.periscope_is_live"),
    Set(PrivateBroadcastMetrics, PublicBroadcastMetrics).asJava)
val PERISCOPE_HAS_BEEN_FEATURED =
  new Binary(
    name("timelines.earlybird.periscope_has_been_featured"),
    Set(PrivateBroadcastMetrics, PublicBroadcastMetrics).asJava)
val PERISCOPE_IS_CURRENTLY_FEATURED =
  new Binary(
    name("timelines.earlybird.periscope_is_currently_featured"),
    Set(PrivateBroadcastMetrics, PublicBroadcastMetrics).asJava)
val PERISCOPE_IS_FROM_QUALITY_SOURCE =
  new Binary(
    name("timelines.earlybird.periscope_is_from_quality_source"),
    Set(PrivateBroadcastMetrics, PublicBroadcastMetrics).asJava)
|
||||
|
||||
// Remaining timelines.earlybird.* features. VISIBLE_TOKEN_RATIO is declared without
// any data-type annotations.
val VISIBLE_TOKEN_RATIO =
  new Continuous(name("timelines.earlybird.visible_token_ratio"))
val HAS_QUOTE = new Binary(
  name("timelines.earlybird.has_quote"),
  Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava
)
val IS_COMPOSER_SOURCE_CAMERA = new Binary(
  name("timelines.earlybird.is_composer_source_camera"),
  Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava
)

// Deliberately named "timelines.earlybird_score" so that it stays separate from the
// rest of the "timelines.earlybird." namespace.
val EARLYBIRD_SCORE =
  new Continuous(name("timelines.earlybird_score"), Set(EngagementScore).asJava)
|
||||
|
||||
// Dwell-time features in the timelines.engagement namespace. All five share the same
// annotations: EngagementDurationAndTimestamp, ImpressionMetadata and PrivateTimestamp.
val DWELL_TIME_MS = new Continuous(
  name("timelines.engagement.dwell_time_ms"),
  Set(EngagementDurationAndTimestamp, ImpressionMetadata, PrivateTimestamp).asJava
)

val TWEET_DETAIL_DWELL_TIME_MS = new Continuous(
  name("timelines.engagement.tweet_detail_dwell_time_ms"),
  Set(EngagementDurationAndTimestamp, ImpressionMetadata, PrivateTimestamp).asJava
)

val PROFILE_DWELL_TIME_MS = new Continuous(
  name("timelines.engagement.profile_dwell_time_ms"),
  Set(EngagementDurationAndTimestamp, ImpressionMetadata, PrivateTimestamp).asJava
)

val FULLSCREEN_VIDEO_DWELL_TIME_MS = new Continuous(
  name("timelines.engagement.fullscreen_video_dwell_time_ms"),
  Set(EngagementDurationAndTimestamp, ImpressionMetadata, PrivateTimestamp).asJava
)

val LINK_DWELL_TIME_MS = new Continuous(
  name("timelines.engagement.link_dwell_time_ms"),
  Set(EngagementDurationAndTimestamp, ImpressionMetadata, PrivateTimestamp).asJava
)
|
||||
|
||||
// Media features in the tweetsource.tweet.media namespace.
// Everything down to WIDTH_4 is annotated with MediaFile and MediaProcessingInformation;
// the tag and sticker features at the end use the tweet-entity annotations instead.
// Declaration order is kept exactly as before (HEIGHT_2 precedes HEIGHT_1).
val ASPECT_RATIO_DEN = new Continuous(
  name("tweetsource.tweet.media.aspect_ratio_den"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val ASPECT_RATIO_NUM = new Continuous(
  name("tweetsource.tweet.media.aspect_ratio_num"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val BIT_RATE = new Continuous(
  name("tweetsource.tweet.media.bit_rate"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val HEIGHT_2 = new Continuous(
  name("tweetsource.tweet.media.height_2"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val HEIGHT_1 = new Continuous(
  name("tweetsource.tweet.media.height_1"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val HEIGHT_3 = new Continuous(
  name("tweetsource.tweet.media.height_3"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val HEIGHT_4 = new Continuous(
  name("tweetsource.tweet.media.height_4"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val RESIZE_METHOD_1 = new Discrete(
  name("tweetsource.tweet.media.resize_method_1"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val RESIZE_METHOD_2 = new Discrete(
  name("tweetsource.tweet.media.resize_method_2"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val RESIZE_METHOD_3 = new Discrete(
  name("tweetsource.tweet.media.resize_method_3"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val RESIZE_METHOD_4 = new Discrete(
  name("tweetsource.tweet.media.resize_method_4"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val VIDEO_DURATION = new Continuous(
  name("tweetsource.tweet.media.video_duration"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val WIDTH_1 = new Continuous(
  name("tweetsource.tweet.media.width_1"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val WIDTH_2 = new Continuous(
  name("tweetsource.tweet.media.width_2"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val WIDTH_3 = new Continuous(
  name("tweetsource.tweet.media.width_3"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val WIDTH_4 = new Continuous(
  name("tweetsource.tweet.media.width_4"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val NUM_MEDIA_TAGS = new Continuous(
  name("tweetsource.tweet.media.num_tags"),
  Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava
)
val MEDIA_TAG_SCREEN_NAMES = new SparseBinary(
  name("tweetsource.tweet.media.tag_screen_names"),
  Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava
)
val STICKER_IDS = new SparseBinary(
  name("tweetsource.tweet.media.sticker_ids"),
  Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava
)
|
||||
|
||||
// Media features in the tweetsource.v2.tweet.media namespace.
// The "pallette" spelling is part of the registered feature name and must not change.
val NUM_COLOR_PALLETTE_ITEMS = new Continuous(
  name("tweetsource.v2.tweet.media.num_color_pallette_items"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val COLOR_1_RED = new Continuous(
  name("tweetsource.v2.tweet.media.color_1_red"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val COLOR_1_BLUE = new Continuous(
  name("tweetsource.v2.tweet.media.color_1_blue"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val COLOR_1_GREEN = new Continuous(
  name("tweetsource.v2.tweet.media.color_1_green"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val COLOR_1_PERCENTAGE = new Continuous(
  name("tweetsource.v2.tweet.media.color_1_percentage"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val MEDIA_PROVIDERS = new SparseBinary(
  name("tweetsource.v2.tweet.media.providers"),
  Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava
)
val IS_360 = new Binary(
  name("tweetsource.v2.tweet.media.is_360"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val VIEW_COUNT = new Continuous(
  name("tweetsource.v2.tweet.media.view_count"),
  Set(MediaContentMetrics).asJava
)
val IS_MANAGED = new Binary(
  name("tweetsource.v2.tweet.media.is_managed"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val IS_MONETIZABLE = new Binary(
  name("tweetsource.v2.tweet.media.is_monetizable"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val IS_EMBEDDABLE = new Binary(
  name("tweetsource.v2.tweet.media.is_embeddable"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val CLASSIFICATION_LABELS = new SparseContinuous(
  name("tweetsource.v2.tweet.media.classification_labels"),
  Set(MediaFile, MediaProcessingInformation).asJava
)

val NUM_STICKERS = new Continuous(
  name("tweetsource.v2.tweet.media.num_stickers"),
  Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava
)
val NUM_FACES = new Continuous(
  name("tweetsource.v2.tweet.media.num_faces"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val FACE_AREAS = new Continuous(
  name("tweetsource.v2.tweet.media.face_areas"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val HAS_SELECTED_PREVIEW_IMAGE = new Binary(
  name("tweetsource.v2.tweet.media.has_selected_preview_image"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val HAS_TITLE = new Binary(
  name("tweetsource.v2.tweet.media.has_title"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val HAS_DESCRIPTION = new Binary(
  name("tweetsource.v2.tweet.media.has_description"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val HAS_VISIT_SITE_CALL_TO_ACTION = new Binary(
  name("tweetsource.v2.tweet.media.has_visit_site_call_to_action"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val HAS_APP_INSTALL_CALL_TO_ACTION = new Binary(
  name("tweetsource.v2.tweet.media.has_app_install_call_to_action"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
val HAS_WATCH_NOW_CALL_TO_ACTION = new Binary(
  name("tweetsource.v2.tweet.media.has_watch_now_call_to_action"),
  Set(MediaFile, MediaProcessingInformation).asJava
)
|
||||
|
||||
// Tweet text features. The names span three namespaces (tweetsource.tweet.text,
// tweetsource.v3.tweet.text and tweetsource.v4.tweet.text); all of them are annotated
// with PublicTweets and PrivateTweets.
val NUM_CAPS = new Continuous(
  name("tweetsource.tweet.text.num_caps"),
  Set(PublicTweets, PrivateTweets).asJava
)
val TWEET_LENGTH = new Continuous(
  name("tweetsource.tweet.text.length"),
  Set(PublicTweets, PrivateTweets).asJava
)
val TWEET_LENGTH_TYPE =
  new Discrete(name("tweetsource.tweet.text.length_type"), Set(PublicTweets, PrivateTweets).asJava)
val NUM_WHITESPACES =
  new Continuous(
    name("tweetsource.tweet.text.num_whitespaces"),
    Set(PublicTweets, PrivateTweets).asJava)
val HAS_QUESTION = new Binary(
  name("tweetsource.tweet.text.has_question"),
  Set(PublicTweets, PrivateTweets).asJava
)
val NUM_NEWLINES =
  new Continuous(name("tweetsource.tweet.text.num_newlines"), Set(PublicTweets, PrivateTweets).asJava)
val EMOJI_TOKENS =
  new SparseBinary(
    name("tweetsource.v3.tweet.text.emoji_tokens"),
    Set(PublicTweets, PrivateTweets).asJava)
val EMOTICON_TOKENS =
  new SparseBinary(
    name("tweetsource.v3.tweet.text.emoticon_tokens"),
    Set(PublicTweets, PrivateTweets).asJava)
val NUM_EMOJIS =
  new Continuous(name("tweetsource.v3.tweet.text.num_emojis"), Set(PublicTweets, PrivateTweets).asJava)
val NUM_EMOTICONS =
  new Continuous(
    name("tweetsource.v3.tweet.text.num_emoticons"),
    Set(PublicTweets, PrivateTweets).asJava)
val POS_UNIGRAMS =
  new SparseBinary(
    name("tweetsource.v3.tweet.text.pos_unigrams"),
    Set(PublicTweets, PrivateTweets).asJava)
val POS_BIGRAMS =
  new SparseBinary(
    name("tweetsource.v3.tweet.text.pos_bigrams"),
    Set(PublicTweets, PrivateTweets).asJava)
val TEXT_TOKENS =
  new SparseBinary(name("tweetsource.v4.tweet.text.tokens"), Set(PublicTweets, PrivateTweets).asJava)
|
||||
|
||||
// Scores produced by the health models (see go/toxicity, go/pblock, go/pspammytweet).
// Every score is a Continuous feature annotated with TweetSafetyScores.
val PBLOCK_SCORE = new Continuous(
  name("timelines.earlybird.pblock_score"),
  Set(TweetSafetyScores).asJava
)
val TOXICITY_SCORE = new Continuous(
  name("timelines.earlybird.toxicity_score"),
  Set(TweetSafetyScores).asJava
)
val EXPERIMENTAL_HEALTH_MODEL_SCORE_1 = new Continuous(
  name("timelines.earlybird.experimental_health_model_score_1"),
  Set(TweetSafetyScores).asJava
)
val EXPERIMENTAL_HEALTH_MODEL_SCORE_2 = new Continuous(
  name("timelines.earlybird.experimental_health_model_score_2"),
  Set(TweetSafetyScores).asJava
)
val EXPERIMENTAL_HEALTH_MODEL_SCORE_3 = new Continuous(
  name("timelines.earlybird.experimental_health_model_score_3"),
  Set(TweetSafetyScores).asJava
)
val EXPERIMENTAL_HEALTH_MODEL_SCORE_4 = new Continuous(
  name("timelines.earlybird.experimental_health_model_score_4"),
  Set(TweetSafetyScores).asJava
)
val PSPAMMY_TWEET_SCORE = new Continuous(
  name("timelines.earlybird.pspammy_tweet_score"),
  Set(TweetSafetyScores).asJava
)
val PREPORTED_TWEET_SCORE = new Continuous(
  name("timelines.earlybird.preported_tweet_score"),
  Set(TweetSafetyScores).asJava
)
|
||||
|
||||
// Where the record was displayed, e.g. recap vs. ranked timeline vs. recycled.
// Do NOT use this for training in prediction: it is set post-scoring.
//
// How it differs from related features:
//  - TimelinesSharedFeatures.INJECTION_TYPE is only ever set to Recap or Rectweet and is
//    available pre-scoring.
//  - TimeFeatures.IS_TWEET_RECYCLED is set pre-scoring and indicates that a tweet is being
//    considered for recycling, whereas DISPLAY_SUGGEST_TYPE == RecycledTweet means the
//    tweet was actually served in a recycled tweet module.
//
// The two should currently have the same value, but that need not hold in future. Use
// IS_TWEET_RECYCLED/CANDIDATE_TWEET_SOURCE_ID for training models, and use
// DISPLAY_SUGGEST_TYPE only for offline analysis of tweets actually served in
// recycled modules.
val DISPLAY_SUGGEST_TYPE =
  new Discrete(name("recap.display.suggest_type"))

// Candidate tweet source id. Related to DISPLAY_SUGGEST_TYPE, but it is a property of
// the candidate rather than of the display location, so unlike DISPLAY_SUGGEST_TYPE it
// is safe to use in model training.
val CANDIDATE_TWEET_SOURCE_ID = new Discrete(
  name("timelines.meta.candidate_tweet_source_id"),
  Set(TweetId).asJava
)
|
||||
|
||||
// True when at least 50% of the tweet was in the user's viewport for at least 500 ms,
// OR the user engaged with the tweet publicly or privately.
val IS_LINGER_IMPRESSION = new Binary(
  name("timelines.engagement.is_linger_impression"),
  Set(EngagementsPrivate).asJava
)

// Features to create rollups
val LANGUAGE_GROUP =
  new Discrete(name("timelines.tweet.text.language_group"))

// Final position index, in the timeline served from TLM, of the tweet being trained on
// (it could still change later in TLS-API), as recorded by
// PositionIndexLoggingEnvelopeTransform.
val FINAL_POSITION_INDEX =
  new Discrete(name("timelines.display.final_position_index"))

// traceId of the timeline request; can be used to group tweets from the same response.
val TRACE_ID =
  new Discrete(name("timelines.display.trace_id"), Set(TfeTransactionId).asJava)

// Whether this tweet was randomly injected into the timeline for exploration purposes.
val IS_RANDOM_TWEET =
  new Binary(name("timelines.display.is_random_tweet"))

// Whether this tweet was reordered with softmax ranking for explore/exploit and therefore
// needs to be excluded from the exploit-only holdback.
val IS_SOFTMAX_RANKING_TWEET =
  new Binary(name("timelines.display.is_softmax_ranking_tweet"))

// Whether the user viewing the tweet has disabled the ranked timeline.
val IS_RANKED_TIMELINE_DISABLER =
  new Binary(
    name("timelines.user_features.is_ranked_timeline_disabler"),
    Set(AnnotationValue, GeneralSettings).asJava
  )

// Whether the user viewing the tweet was one of those released from DDG 4205 control
// as part of the http://go/shrink-4205 process to shrink the quality features holdback.
val IS_USER_RELEASED_FROM_QUALITY_HOLDBACK =
  new Binary(
    name("timelines.user_features.is_released_from_quality_holdback"),
    Set(ExperimentId, ExperimentName).asJava
  )
|
||||
|
||||
// Features in the timelines.initial_prediction namespace, one per engagement type,
// each annotated with EngagementScore.
val INITIAL_PREDICTION_FAV = new Continuous(
  name("timelines.initial_prediction.fav"),
  Set(EngagementScore).asJava
)
val INITIAL_PREDICTION_RETWEET = new Continuous(
  name("timelines.initial_prediction.retweet"),
  Set(EngagementScore).asJava
)
val INITIAL_PREDICTION_REPLY = new Continuous(
  name("timelines.initial_prediction.reply"),
  Set(EngagementScore).asJava
)
val INITIAL_PREDICTION_OPEN_LINK = new Continuous(
  name("timelines.initial_prediction.open_link"),
  Set(EngagementScore).asJava
)
val INITIAL_PREDICTION_PROFILE_CLICK = new Continuous(
  name("timelines.initial_prediction.profile_click"),
  Set(EngagementScore).asJava
)
val INITIAL_PREDICTION_VIDEO_PLAYBACK_50 =
  new Continuous(name("timelines.initial_prediction.video_playback_50"), Set(EngagementScore).asJava)
val INITIAL_PREDICTION_DETAIL_EXPAND = new Continuous(
  name("timelines.initial_prediction.detail_expand"),
  Set(EngagementScore).asJava
)
val INITIAL_PREDICTION_PHOTO_EXPAND = new Continuous(
  name("timelines.initial_prediction.photo_expand"),
  Set(EngagementScore).asJava
)

val VIEWER_FOLLOWS_ORIGINAL_AUTHOR = new Binary(
  name("timelines.viewer_follows_original_author"),
  Set(Follow).asJava
)

// Position features in the timelines.position namespace; none carries annotations.
val IS_TOP_ONE =
  new Binary(name("timelines.position.is_top_one"))
val IS_TOP_FIVE = new Binary(name(featureName = "timelines.position.is_top_five"))
val IS_TOP_TEN = new Binary(name(featureName = "timelines.position.is_top_ten"))

val LOG_POSITION = new Continuous(name(featureName = "timelines.position.log_10"))
|
||||
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
# Library built from every Scala source in this directory.
scala_library(
    sources = ["*.scala"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "src/java/com/twitter/ml/api:api-base",
        "src/thrift/com/twitter/dal/personal_data:personal_data-java",
        "src/thrift/com/twitter/timelineservice/server/suggests/features/engagement_features:thrift-scala",
        "timelines/data_processing/ml_util/aggregation_framework:common_types",
        "timelines/data_processing/ml_util/transforms",
    ],
)
|
|
@ -0,0 +1,246 @@
|
|||
package com.twitter.timelines.prediction.features.engagement_features
|
||||
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import com.twitter.logging.Logger
|
||||
import com.twitter.ml.api.DataRecord
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.ml.api.Feature.Continuous
|
||||
import com.twitter.ml.api.Feature.SparseBinary
|
||||
import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform
|
||||
import com.twitter.timelines.data_processing.ml_util.transforms.RichITransform
|
||||
import com.twitter.timelines.data_processing.ml_util.transforms.SparseBinaryUnion
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
|
||||
import com.twitter.timelineservice.suggests.features.engagement_features.thriftscala.{
|
||||
EngagementFeatures => ThriftEngagementFeatures
|
||||
}
|
||||
import com.twitter.timelineservice.suggests.features.engagement_features.v1.thriftscala.{
|
||||
EngagementFeatures => ThriftEngagementFeaturesV1
|
||||
}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Declares the kinds of derived engagement feature and converts the Thrift
 * representation into the [[EngagementFeatures]] case class.
 */
object EngagementFeatures {
  private[this] val logger = Logger.get(getClass.getSimpleName)

  /** Keys identifying the individual features tracked for one engagement type. */
  sealed trait EngagementFeature
  case object Count extends EngagementFeature
  case object RealGraphWeightAverage extends EngagementFeature
  case object RealGraphWeightMax extends EngagementFeature
  case object RealGraphWeightMin extends EngagementFeature
  case object RealGraphWeightMissing extends EngagementFeature
  case object RealGraphWeightVariance extends EngagementFeature
  case object UserIds extends EngagementFeature

  /**
   * Converts the Thrift union into an [[EngagementFeatures]].
   *
   * Only the V1 variant is handled: its favoritedBy, retweetedBy and repliedBy lists are
   * copied, while realGraphWeightByUser keeps its default (empty) value. Any other
   * variant is logged as an error and produces None.
   *
   * @param thriftEngagementFeatures the Thrift union to convert
   * @return Some(features) for a V1 payload, None otherwise
   */
  def fromThrift(thriftEngagementFeatures: ThriftEngagementFeatures): Option[EngagementFeatures] =
    thriftEngagementFeatures match {
      case versioned: ThriftEngagementFeatures.V1 =>
        val converted = EngagementFeatures(
          favoritedBy = versioned.v1.favoritedBy,
          retweetedBy = versioned.v1.retweetedBy,
          repliedBy = versioned.v1.repliedBy
        )
        Some(converted)
      case _ =>
        logger.error("Unexpected EngagementFeatures version found.")
        None
    }

  /** Instance built entirely from the case class defaults (no engagers, no weights). */
  val empty: EngagementFeatures = EngagementFeatures()
}
|
||||
|
||||
/**
 * Holds the IDs of users who have engaged with a target entity, such as a Tweet,
 * plus any additional data needed for derived features.
 *
 * @param favoritedBy user IDs in the "favorited by" list
 * @param retweetedBy user IDs in the "retweeted by" list
 * @param repliedBy user IDs in the "replied by" list
 * @param realGraphWeightByUser weights keyed by a Long (presumably Real Graph weight per
 *                              user ID; it is not part of the Thrift round trip)
 */
case class EngagementFeatures(
  favoritedBy: Seq[Long] = Nil,
  retweetedBy: Seq[Long] = Nil,
  repliedBy: Seq[Long] = Nil,
  realGraphWeightByUser: Map[Long, Double] = Map.empty) {

  /** True when all three engager lists are empty; realGraphWeightByUser is not consulted. */
  def isEmpty: Boolean =
    Seq(favoritedBy, retweetedBy, repliedBy).forall(_.isEmpty)

  /** Negation of [[isEmpty]]. */
  def nonEmpty: Boolean = !isEmpty

  /** Wraps the three engager lists in the V1 Thrift union; the weight map is not included. */
  def toLogThrift: ThriftEngagementFeatures.V1 = {
    val payload = ThriftEngagementFeaturesV1(
      favoritedBy = favoritedBy,
      retweetedBy = retweetedBy,
      repliedBy = repliedBy
    )
    ThriftEngagementFeatures.V1(payload)
  }
}
|
||||
|
||||
/**
 * Engagement features derived from the Real Graph weight.
 *
 * The features are taken from the perspective of the source user, who is viewing their
 * timeline, towards the destination user or users, who created the engagements.
 *
 * @param count    number of engagements present
 * @param max      max score of the engaging users
 * @param mean     average score of the engaging users
 * @param min      minimum score of the engaging users
 * @param missing  for engagements present, how many Real Graph scores were missing
 * @param variance variance of scores of the engaging users
 */
case class RealGraphDerivedEngagementFeatures(
  count: Int,
  max: Double,
  mean: Double,
  min: Double,
  missing: Int,
  variance: Double
)
|
||||
|
||||
/**
 * DataRecord feature definitions for engagement features, grouped per engagement type
 * (favorites, retweets, replies), plus a transform that unions the three engager-ID
 * features into a single feature.
 */
object EngagementDataRecordFeatures {
  import EngagementFeatures._

  // Engager user IDs, one sparse-binary feature per engagement type.
  val FavoritedByUserIds =
    new SparseBinary(
      "engagement_features.user_ids.favorited_by",
      Set(UserId, PrivateLikes, PublicLikes).asJava
    )
  val RetweetedByUserIds =
    new SparseBinary(
      "engagement_features.user_ids.retweeted_by",
      Set(UserId, PrivateRetweets, PublicRetweets).asJava
    )
  val RepliedByUserIds =
    new SparseBinary(
      "engagement_features.user_ids.replied_by",
      Set(UserId, PrivateReplies, PublicReplies).asJava
    )

  // In-network engagement counts.
  val InNetworkFavoritesCount =
    new Continuous(
      "engagement_features.in_network.favorites.count",
      Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
    )
  val InNetworkRetweetsCount =
    new Continuous(
      "engagement_features.in_network.retweets.count",
      Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
    )
  val InNetworkRepliesCount =
    new Continuous(
      "engagement_features.in_network.replies.count",
      Set(CountOfPrivateReplies, CountOfPublicReplies).asJava
    )

  // Real Graph derived features. The avg/max/min features carry the matching count
  // annotations; the "missing" and "weight_variance" features are declared without any.
  val InNetworkFavoritesAvgRealGraphWeight =
    new Continuous(
      "engagement_features.real_graph.favorites.avg_weight",
      Set(CountOfPrivateLikes, CountOfPublicLikes).asJava)
  val InNetworkFavoritesMaxRealGraphWeight =
    new Continuous(
      "engagement_features.real_graph.favorites.max_weight",
      Set(CountOfPrivateLikes, CountOfPublicLikes).asJava)
  val InNetworkFavoritesMinRealGraphWeight =
    new Continuous(
      "engagement_features.real_graph.favorites.min_weight",
      Set(CountOfPrivateLikes, CountOfPublicLikes).asJava)
  val InNetworkFavoritesRealGraphWeightMissing =
    new Continuous("engagement_features.real_graph.favorites.missing")
  val InNetworkFavoritesRealGraphWeightVariance =
    new Continuous("engagement_features.real_graph.favorites.weight_variance")

  val InNetworkRetweetsMaxRealGraphWeight =
    new Continuous(
      "engagement_features.real_graph.retweets.max_weight",
      Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava)
  val InNetworkRetweetsMinRealGraphWeight =
    new Continuous(
      "engagement_features.real_graph.retweets.min_weight",
      Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava)
  val InNetworkRetweetsAvgRealGraphWeight =
    new Continuous(
      "engagement_features.real_graph.retweets.avg_weight",
      Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava)
  val InNetworkRetweetsRealGraphWeightMissing =
    new Continuous("engagement_features.real_graph.retweets.missing")
  val InNetworkRetweetsRealGraphWeightVariance =
    new Continuous("engagement_features.real_graph.retweets.weight_variance")

  val InNetworkRepliesMaxRealGraphWeight =
    new Continuous(
      "engagement_features.real_graph.replies.max_weight",
      Set(CountOfPrivateReplies, CountOfPublicReplies).asJava)
  val InNetworkRepliesMinRealGraphWeight =
    new Continuous(
      "engagement_features.real_graph.replies.min_weight",
      Set(CountOfPrivateReplies, CountOfPublicReplies).asJava)
  val InNetworkRepliesAvgRealGraphWeight =
    new Continuous(
      "engagement_features.real_graph.replies.avg_weight",
      Set(CountOfPrivateReplies, CountOfPublicReplies).asJava)
  val InNetworkRepliesRealGraphWeightMissing =
    new Continuous("engagement_features.real_graph.replies.missing")
  val InNetworkRepliesRealGraphWeightVariance =
    new Continuous("engagement_features.real_graph.replies.weight_variance")

  /** The features belonging to one engagement type, keyed by [[EngagementFeature]]. */
  sealed trait FeatureGroup {
    def continuousFeatures: Map[EngagementFeature, Continuous]
    def sparseBinaryFeatures: Map[EngagementFeature, SparseBinary]

    /** Every feature of the group: the continuous ones followed by the sparse-binary ones. */
    def allFeatures: Seq[Feature[_]] = {
      val combined = continuousFeatures.values ++ sparseBinaryFeatures.values
      combined.toSeq
    }
  }

  /** Feature group for favorites. */
  case object Favorites extends FeatureGroup {
    override val continuousFeatures: Map[EngagementFeature, Continuous] = Map(
      Count -> InNetworkFavoritesCount,
      RealGraphWeightAverage -> InNetworkFavoritesAvgRealGraphWeight,
      RealGraphWeightMax -> InNetworkFavoritesMaxRealGraphWeight,
      RealGraphWeightMin -> InNetworkFavoritesMinRealGraphWeight,
      RealGraphWeightMissing -> InNetworkFavoritesRealGraphWeightMissing,
      RealGraphWeightVariance -> InNetworkFavoritesRealGraphWeightVariance
    )

    override val sparseBinaryFeatures: Map[EngagementFeature, SparseBinary] = Map(
      UserIds -> FavoritedByUserIds
    )
  }

  /** Feature group for retweets. */
  case object Retweets extends FeatureGroup {
    override val continuousFeatures: Map[EngagementFeature, Continuous] = Map(
      Count -> InNetworkRetweetsCount,
      RealGraphWeightAverage -> InNetworkRetweetsAvgRealGraphWeight,
      RealGraphWeightMax -> InNetworkRetweetsMaxRealGraphWeight,
      RealGraphWeightMin -> InNetworkRetweetsMinRealGraphWeight,
      RealGraphWeightMissing -> InNetworkRetweetsRealGraphWeightMissing,
      RealGraphWeightVariance -> InNetworkRetweetsRealGraphWeightVariance
    )

    override val sparseBinaryFeatures: Map[EngagementFeature, SparseBinary] = Map(
      UserIds -> RetweetedByUserIds
    )
  }

  /** Feature group for replies. */
  case object Replies extends FeatureGroup {
    override val continuousFeatures: Map[EngagementFeature, Continuous] = Map(
      Count -> InNetworkRepliesCount,
      RealGraphWeightAverage -> InNetworkRepliesAvgRealGraphWeight,
      RealGraphWeightMax -> InNetworkRepliesMaxRealGraphWeight,
      RealGraphWeightMin -> InNetworkRepliesMinRealGraphWeight,
      RealGraphWeightMissing -> InNetworkRepliesRealGraphWeightMissing,
      RealGraphWeightVariance -> InNetworkRepliesRealGraphWeightVariance
    )

    override val sparseBinaryFeatures: Map[EngagementFeature, SparseBinary] = Map(
      UserIds -> RepliedByUserIds
    )
  }

  // The three engager-ID features that are unioned into PublicEngagementUserIds.
  val PublicEngagerSets = Set(FavoritedByUserIds, RetweetedByUserIds, RepliedByUserIds)
  val PublicEngagementUserIds =
    new SparseBinary("engagement_features.user_ids.public", Set(UserId, EngagementsPublic).asJava)
  val ENGAGER_ID = TypedAggregateGroup.sparseFeature(PublicEngagementUserIds)

  // Transform configured with PublicEngagerSets as input and PublicEngagementUserIds as output.
  val UnifyPublicEngagersTransform =
    SparseBinaryUnion(featuresToUnify = PublicEngagerSets, outputFeature = PublicEngagementUserIds)

  /** [[OneToSomeTransform]] that applies UnifyPublicEngagersTransform through RichITransform. */
  object RichUnifyPublicEngagersTransform extends OneToSomeTransform {
    override def apply(dataRecord: DataRecord): Option[DataRecord] = {
      val wrapped = RichITransform(UnifyPublicEngagersTransform)
      wrapped(dataRecord)
    }

    override def featuresToTransform: Set[Feature[_]] =
      UnifyPublicEngagersTransform.featuresToUnify.toSet
  }
}
|
|
@ -0,0 +1,19 @@
|
|||
# Default target: compiles every Scala source in this directory.
scala_library(
    sources = ["*.scala"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "src/java/com/twitter/ml/api:api-base",
        # EscherbirdFeatures.scala is matched by the glob above and imports PersonalDataType
        # directly, so the dependency is declared explicitly here, exactly as the
        # "escherbird-features" target below already does for the same source file.
        "src/thrift/com/twitter/dal/personal_data:personal_data-java",
        "src/thrift/com/twitter/tweetypie:tweet-scala",
    ],
)

# Narrow target that exposes only EscherbirdFeatures.scala.
scala_library(
    name = "escherbird-features",
    sources = ["EscherbirdFeatures.scala"],
    tags = ["bazel-only"],
    dependencies = [
        "src/java/com/twitter/ml/api:api-base",
        "src/thrift/com/twitter/dal/personal_data:personal_data-java",
    ],
)
|
|
@ -0,0 +1,19 @@
|
|||
package com.twitter.timelines.prediction.features.escherbird
|
||||
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import com.twitter.ml.api.Feature
|
||||
import java.util.{Set => JSet}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/** Sparse-binary DataRecord features under the "escherbird." name prefix. */
object EscherbirdFeatures {
  // Declared without any personal-data-type annotations.
  val TweetGroupIds =
    new Feature.SparseBinary("escherbird.tweet_group_ids")
  val TweetDomainIds =
    new Feature.SparseBinary("escherbird.tweet_domain_ids", Set(DomainId).asJava)
  val TweetEntityIds = new Feature.SparseBinary(
    "escherbird.tweet_entity_ids",
    Set(SemanticcoreClassification).asJava
  )
}
|
||||
|
||||
/**
 * Group, domain and entity identifiers attached to one tweet, each held as a Java set
 * of strings.
 *
 * @param tweetId        ID of the tweet the identifiers belong to
 * @param tweetGroupIds  group IDs as strings
 * @param tweetDomainIds domain IDs as strings
 * @param tweetEntityIds entity IDs as strings
 */
case class EscherbirdFeatures(
  tweetId: Long,
  tweetGroupIds: JSet[String],
  tweetDomainIds: JSet[String],
  tweetEntityIds: JSet[String]
)
|
|
@ -0,0 +1,19 @@
|
|||
package com.twitter.timelines.prediction.features.escherbird
|
||||
|
||||
import com.twitter.tweetypie.thriftscala.Tweet
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/** Builds [[EscherbirdFeatures]] from the entity annotations carried by a Tweet. */
object EscherbirdFeaturesConverter {

  /** Domain IDs whose annotations are dropped before any feature is built. */
  val DeprecatedOrTestDomains: Set[Long] = Set(1L, 5L, 7L, 9L, 14L, 19L, 20L, 31L)

  /**
   * Extracts the distinct group, domain and entity IDs from the tweet's annotations.
   *
   * Annotations whose domain is listed in [[DeprecatedOrTestDomains]] are ignored.
   * Entity IDs are emitted as "<domainId>.<entityId>" because an entity is only unique
   * within a given domain.
   *
   * @param tweet the tweet to read annotations from
   * @return None when the tweet carries no escherbirdEntityAnnotations, otherwise the
   *         features (whose sets may be empty if every annotation was filtered out)
   */
  def fromTweet(tweet: Tweet): Option[EscherbirdFeatures] =
    for (annotationSet <- tweet.escherbirdEntityAnnotations) yield {
      val kept = annotationSet.entityAnnotations
        .filter(annotation => !DeprecatedOrTestDomains.contains(annotation.domainId))

      val groupIds = kept.map(annotation => annotation.groupId.toString).toSet
      val domainIds = kept.map(annotation => annotation.domainId.toString).toSet
      val entityIds = kept.map(annotation => s"${annotation.domainId}.${annotation.entityId}").toSet

      EscherbirdFeatures(tweet.id, groupIds.asJava, domainIds.asJava, entityIds.asJava)
    }
}
|
|
@ -0,0 +1,7 @@
|
|||
# Library built from every Scala source in this directory.
scala_library(
    sources = ["*.scala"],
    dependencies = [
        "src/java/com/twitter/ml/api:api-base",
        "src/thrift/com/twitter/dal/personal_data:personal_data-java",
    ],
)
|
|
@ -0,0 +1,53 @@
|
|||
package com.twitter.timelines.prediction.features.followsource
|
||||
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Feature definitions under the "follow_source." name prefix. Each val pairs a feature name
 * with the set of personal-data types passed to its constructor.
 */
object FollowSourceFeatures {
|
||||
|
||||
// Corresponds to an algorithm constant from com.twitter.hermit.profile.HermitProfileConstants
|
||||
// NOTE(review): this is the only feature in this object constructed without a personal-data-type
// set — confirm that is intentional.
val FollowSourceAlgorithm = new Feature.Text("follow_source.algorithm")
|
||||
|
||||
// Type of follow action: one of "unfollow", "follow", "follow_back", "follow_many", "follow_all"
|
||||
val FollowAction = new Feature.Text(
|
||||
"follow_source.action",
|
||||
Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava)
|
||||
|
||||
// Millisecond timestamp when follow occurred
|
||||
val FollowTimestamp =
|
||||
new Feature.Discrete("follow_source.follow_timestamp", Set(Follow, PrivateTimestamp).asJava)
|
||||
|
||||
// Age of follow (in minutes)
|
||||
val FollowAgeMinutes =
|
||||
new Feature.Continuous("follow_source.follow_age_minutes", Set(Follow).asJava)
|
||||
|
||||
// Tweet ID of tweet details page from where follow happened (if applicable)
|
||||
val FollowCauseTweetId = new Feature.Discrete("follow_source.cause_tweet_id", Set(TweetId).asJava)
|
||||
|
||||
// String representation of follow client (android, web, iphone, etc). Derived from "client"
|
||||
// portion of client event namespace.
|
||||
val FollowClientId = new Feature.Text("follow_source.client_id", Set(ClientType).asJava)
|
||||
|
||||
// If the follow happens via a profile's Following or Followers,
|
||||
// the id of the profile owner is recorded here.
|
||||
val FollowAssociationId =
|
||||
new Feature.Discrete("follow_source.association_id", Set(Follow, UserId).asJava)
|
||||
|
||||
// The "friendly name" here is computed using FollowSourceUtil.getSource. It represents
|
||||
// a grouping on a few client events that reflect where the event occurred. For example,
|
||||
// events on the tweet details page are grouped using "tweetDetails":
|
||||
// case (Some("web"), Some("permalink"), _, _, _) => "tweetDetails"
|
||||
// case (Some("iphone"), Some("tweet"), _, _, _) => "tweetDetails"
|
||||
// case (Some("android"), Some("tweet"), _, _, _) => "tweetDetails"
|
||||
val FollowSourceFriendlyName = new Feature.Text("follow_source.friendly_name", Set(Follow).asJava)
|
||||
|
||||
// Up to two sources and actions that preceded the follow (for example, a profile visit
|
||||
// through a mention click, which itself was on a tweet detail page reached through a tweet
|
||||
// click in the Home tab). See go/followsource for more details and examples.
|
||||
// The "source" here is computed using FollowSourceUtil.getSource
|
||||
val PreFollowAction1 = new Feature.Text("follow_source.pre_follow_action_1", Set(Follow).asJava)
|
||||
val PreFollowAction2 = new Feature.Text("follow_source.pre_follow_action_2", Set(Follow).asJava)
|
||||
val PreFollowSource1 = new Feature.Text("follow_source.pre_follow_source_1", Set(Follow).asJava)
|
||||
val PreFollowSource2 = new Feature.Text("follow_source.pre_follow_source_2", Set(Follow).asJava)
|
||||
}
|
|
@ -0,0 +1,9 @@
|
|||
# Library target built from every file matching the "*.scala" glob, with platform "java8" and the
# "bazel-compatible" tag; depends on the ML API base target and the personal_data thrift Java target.
scala_library(
|
||||
sources = ["*.scala"],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"src/java/com/twitter/ml/api:api-base",
|
||||
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,575 @@
|
|||
package com.twitter.timelines.prediction.features.itl
|
||||
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import com.twitter.ml.api.Feature.Binary
|
||||
import com.twitter.ml.api.Feature.Continuous
|
||||
import com.twitter.ml.api.Feature.Discrete
|
||||
import com.twitter.ml.api.Feature.SparseBinary
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object ITLFeatures {
|
||||
// engagement
|
||||
val IS_RETWEETED =
|
||||
new Binary("itl.engagement.is_retweeted", Set(PublicRetweets, PrivateRetweets).asJava)
|
||||
val IS_FAVORITED =
|
||||
new Binary("itl.engagement.is_favorited", Set(PublicLikes, PrivateLikes).asJava)
|
||||
val IS_REPLIED =
|
||||
new Binary("itl.engagement.is_replied", Set(PublicReplies, PrivateReplies).asJava)
|
||||
// v1: post click engagements: fav, reply
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_V1 = new Binary(
|
||||
"itl.engagement.is_good_clicked_convo_desc_favorited_or_replied",
|
||||
Set(
|
||||
PublicLikes,
|
||||
PrivateLikes,
|
||||
PublicReplies,
|
||||
PrivateReplies,
|
||||
EngagementsPrivate,
|
||||
EngagementsPublic).asJava)
|
||||
// v2: post click engagements: click
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_V2 = new Binary(
|
||||
"itl.engagement.is_good_clicked_convo_desc_v2",
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_FAVORITED = new Binary(
|
||||
"itl.engagement.is_good_clicked_convo_desc_favorited",
|
||||
Set(PublicLikes, PrivateLikes).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_REPLIED = new Binary(
|
||||
"itl.engagement.is_good_clicked_convo_desc_replied",
|
||||
Set(PublicReplies, PrivateReplies).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_RETWEETED = new Binary(
|
||||
"itl.engagement.is_good_clicked_convo_desc_retweeted",
|
||||
Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_CLICKED = new Binary(
|
||||
"itl.engagement.is_good_clicked_convo_desc_clicked",
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_FOLLOWED =
|
||||
new Binary("itl.engagement.is_good_clicked_convo_desc_followed", Set(EngagementsPrivate).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_SHARE_DM_CLICKED = new Binary(
|
||||
"itl.engagement.is_good_clicked_convo_desc_share_dm_clicked",
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_PROFILE_CLICKED = new Binary(
|
||||
"itl.engagement.is_good_clicked_convo_desc_profile_clicked",
|
||||
Set(EngagementsPrivate).asJava)
|
||||
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_0 = new Binary(
|
||||
"itl.engagement.is_good_clicked_convo_desc_uam_gt_0",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_1 = new Binary(
|
||||
"itl.engagement.is_good_clicked_convo_desc_uam_gt_1",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_2 = new Binary(
|
||||
"itl.engagement.is_good_clicked_convo_desc_uam_gt_2",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_3 = new Binary(
|
||||
"itl.engagement.is_good_clicked_convo_desc_uam_gt_3",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
|
||||
val IS_TWEET_DETAIL_DWELLED = new Binary(
|
||||
"itl.engagement.is_tweet_detail_dwelled",
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_TWEET_DETAIL_DWELLED_8_SEC = new Binary(
|
||||
"itl.engagement.is_tweet_detail_dwelled_8_sec",
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_TWEET_DETAIL_DWELLED_15_SEC = new Binary(
|
||||
"itl.engagement.is_tweet_detail_dwelled_15_sec",
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_TWEET_DETAIL_DWELLED_25_SEC = new Binary(
|
||||
"itl.engagement.is_tweet_detail_dwelled_25_sec",
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_TWEET_DETAIL_DWELLED_30_SEC = new Binary(
|
||||
"itl.engagement.is_tweet_detail_dwelled_30_sec",
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_PROFILE_DWELLED = new Binary(
|
||||
"itl.engagement.is_profile_dwelled",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_DWELLED_10_SEC = new Binary(
|
||||
"itl.engagement.is_profile_dwelled_10_sec",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_DWELLED_20_SEC = new Binary(
|
||||
"itl.engagement.is_profile_dwelled_20_sec",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_DWELLED_30_SEC = new Binary(
|
||||
"itl.engagement.is_profile_dwelled_30_sec",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_FULLSCREEN_VIDEO_DWELLED = new Binary(
|
||||
"itl.engagement.is_fullscreen_video_dwelled",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_FULLSCREEN_VIDEO_DWELLED_5_SEC = new Binary(
|
||||
"itl.engagement.is_fullscreen_video_dwelled_5_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_FULLSCREEN_VIDEO_DWELLED_10_SEC = new Binary(
|
||||
"itl.engagement.is_fullscreen_video_dwelled_10_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_FULLSCREEN_VIDEO_DWELLED_20_SEC = new Binary(
|
||||
"itl.engagement.is_fullscreen_video_dwelled_20_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_FULLSCREEN_VIDEO_DWELLED_30_SEC = new Binary(
|
||||
"itl.engagement.is_fullscreen_video_dwelled_30_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_LINK_DWELLED_15_SEC = new Binary(
|
||||
"itl.engagement.is_link_dwelled_15_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_LINK_DWELLED_30_SEC = new Binary(
|
||||
"itl.engagement.is_link_dwelled_30_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_LINK_DWELLED_60_SEC = new Binary(
|
||||
"itl.engagement.is_link_dwelled_60_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_QUOTED =
|
||||
new Binary("itl.engagement.is_quoted", Set(PublicRetweets, PrivateRetweets).asJava)
|
||||
val IS_RETWEETED_WITHOUT_QUOTE = new Binary(
|
||||
"itl.engagement.is_retweeted_without_quote",
|
||||
Set(PublicRetweets, PrivateRetweets).asJava)
|
||||
val IS_CLICKED = new Binary(
|
||||
"itl.engagement.is_clicked",
|
||||
Set(EngagementsPrivate, TweetsClicked, LinksClickedOn).asJava)
|
||||
val IS_PROFILE_CLICKED = new Binary(
|
||||
"itl.engagement.is_profile_clicked",
|
||||
Set(EngagementsPrivate, TweetsClicked, ProfilesViewed, ProfilesClicked).asJava)
|
||||
val IS_DWELLED = new Binary("itl.engagement.is_dwelled", Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_IN_BOUNDS_V1 =
|
||||
new Binary("itl.engagement.is_dwelled_in_bounds_v1", Set(EngagementsPrivate).asJava)
|
||||
val DWELL_NORMALIZED_OVERALL =
|
||||
new Continuous("itl.engagement.dwell_normalized_overall", Set(EngagementsPrivate).asJava)
|
||||
val DWELL_CDF_OVERALL =
|
||||
new Continuous("itl.engagement.dwell_cdf_overall", Set(EngagementsPrivate).asJava)
|
||||
val DWELL_CDF = new Continuous("itl.engagement.dwell_cdf", Set(EngagementsPrivate).asJava)
|
||||
|
||||
val IS_DWELLED_1S = new Binary("itl.engagement.is_dwelled_1s", Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_2S = new Binary("itl.engagement.is_dwelled_2s", Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_3S = new Binary("itl.engagement.is_dwelled_3s", Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_4S = new Binary("itl.engagement.is_dwelled_4s", Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_5S = new Binary("itl.engagement.is_dwelled_5s", Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_6S = new Binary("itl.engagement.is_dwelled_6s", Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_7S = new Binary("itl.engagement.is_dwelled_7s", Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_8S = new Binary("itl.engagement.is_dwelled_8s", Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_9S = new Binary("itl.engagement.is_dwelled_9s", Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_10S = new Binary("itl.engagement.is_dwelled_10s", Set(EngagementsPrivate).asJava)
|
||||
|
||||
val IS_SKIPPED_1S = new Binary("itl.engagement.is_skipped_1s", Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_2S = new Binary("itl.engagement.is_skipped_2s", Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_3S = new Binary("itl.engagement.is_skipped_3s", Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_4S = new Binary("itl.engagement.is_skipped_4s", Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_5S = new Binary("itl.engagement.is_skipped_5s", Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_6S = new Binary("itl.engagement.is_skipped_6s", Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_7S = new Binary("itl.engagement.is_skipped_7s", Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_8S = new Binary("itl.engagement.is_skipped_8s", Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_9S = new Binary("itl.engagement.is_skipped_9s", Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_10S = new Binary("itl.engagement.is_skipped_10s", Set(EngagementsPrivate).asJava)
|
||||
|
||||
val IS_FOLLOWED =
|
||||
new Binary("itl.engagement.is_followed", Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_IMPRESSED = new Binary("itl.engagement.is_impressed", Set(EngagementsPrivate).asJava)
|
||||
val IS_OPEN_LINKED =
|
||||
new Binary("itl.engagement.is_open_linked", Set(EngagementsPrivate, LinksClickedOn).asJava)
|
||||
val IS_PHOTO_EXPANDED = new Binary(
|
||||
"itl.engagement.is_photo_expanded",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_VIDEO_VIEWED =
|
||||
new Binary("itl.engagement.is_video_viewed", Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_VIDEO_PLAYBACK_50 = new Binary(
|
||||
"itl.engagement.is_video_playback_50",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_VIDEO_QUALITY_VIEWED = new Binary(
|
||||
"itl.engagement.is_video_quality_viewed",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava
|
||||
)
|
||||
val IS_BOOKMARKED =
|
||||
new Binary("itl.engagement.is_bookmarked", Set(EngagementsPrivate).asJava)
|
||||
val IS_SHARED =
|
||||
new Binary("itl.engagement.is_shared", Set(EngagementsPrivate).asJava)
|
||||
val IS_SHARE_MENU_CLICKED =
|
||||
new Binary("itl.engagement.is_share_menu_clicked", Set(EngagementsPrivate).asJava)
|
||||
|
||||
// Negative engagements
|
||||
val IS_DONT_LIKE =
|
||||
new Binary("itl.engagement.is_dont_like", Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_BLOCK_CLICKED = new Binary(
|
||||
"itl.engagement.is_block_clicked",
|
||||
Set(TweetsClicked, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_BLOCK_DIALOG_BLOCKED = new Binary(
|
||||
"itl.engagement.is_block_dialog_blocked",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_MUTE_CLICKED =
|
||||
new Binary("itl.engagement.is_mute_clicked", Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_MUTE_DIALOG_MUTED =
|
||||
new Binary("itl.engagement.is_mute_dialog_muted", Set(EngagementsPrivate).asJava)
|
||||
val IS_REPORT_TWEET_CLICKED = new Binary(
|
||||
"itl.engagement.is_report_tweet_clicked",
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_CARET_CLICKED =
|
||||
new Binary("itl.engagement.is_caret_clicked", Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_NOT_ABOUT_TOPIC =
|
||||
new Binary("itl.engagement.is_not_about_topic", Set(EngagementsPrivate).asJava)
|
||||
val IS_NOT_RECENT =
|
||||
new Binary("itl.engagement.is_not_recent", Set(EngagementsPrivate).asJava)
|
||||
val IS_NOT_RELEVANT =
|
||||
new Binary("itl.engagement.is_not_relevant", Set(EngagementsPrivate).asJava)
|
||||
val IS_SEE_FEWER =
|
||||
new Binary("itl.engagement.is_see_fewer", Set(EngagementsPrivate).asJava)
|
||||
val IS_UNFOLLOW_TOPIC =
|
||||
new Binary("itl.engagement.is_unfollow_topic", Set(EngagementsPrivate).asJava)
|
||||
val IS_FOLLOW_TOPIC =
|
||||
new Binary("itl.engagement.is_follow_topic", Set(EngagementsPrivate).asJava)
|
||||
val IS_NOT_INTERESTED_IN_TOPIC =
|
||||
new Binary("itl.engagement.is_not_interested_in_topic", Set(EngagementsPrivate).asJava)
|
||||
val IS_HOME_LATEST_VISITED =
|
||||
new Binary("itl.engagement.is_home_latest_visited", Set(EngagementsPrivate).asJava)
|
||||
|
||||
// This derived label is the logical OR of IS_DONT_LIKE, IS_BLOCK_CLICKED, IS_MUTE_CLICKED and IS_REPORT_TWEET_CLICKED
|
||||
val IS_NEGATIVE_FEEDBACK =
|
||||
new Binary("itl.engagement.is_negative_feedback", Set(EngagementsPrivate).asJava)
|
||||
|
||||
// Reciprocal engagements for reply forward engagement
|
||||
val IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR = new Binary(
|
||||
"itl.engagement.is_replied_reply_impressed_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_REPLIED_REPLY_FAVORITED_BY_AUTHOR = new Binary(
|
||||
"itl.engagement.is_replied_reply_favorited_by_author",
|
||||
Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_REPLIED_REPLY_QUOTED_BY_AUTHOR = new Binary(
|
||||
"itl.engagement.is_replied_reply_quoted_by_author",
|
||||
Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_REPLIED_REPLY_REPLIED_BY_AUTHOR = new Binary(
|
||||
"itl.engagement.is_replied_reply_replied_by_author",
|
||||
Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_REPLIED_REPLY_RETWEETED_BY_AUTHOR = new Binary(
|
||||
"itl.engagement.is_replied_reply_retweeted_by_author",
|
||||
Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_REPLIED_REPLY_BLOCKED_BY_AUTHOR = new Binary(
|
||||
"itl.engagement.is_replied_reply_blocked_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_REPLIED_REPLY_FOLLOWED_BY_AUTHOR = new Binary(
|
||||
"itl.engagement.is_replied_reply_followed_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_REPLIED_REPLY_UNFOLLOWED_BY_AUTHOR = new Binary(
|
||||
"itl.engagement.is_replied_reply_unfollowed_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_REPLIED_REPLY_MUTED_BY_AUTHOR = new Binary(
|
||||
"itl.engagement.is_replied_reply_muted_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_REPLIED_REPLY_REPORTED_BY_AUTHOR = new Binary(
|
||||
"itl.engagement.is_replied_reply_reported_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
|
||||
// This derived label is the logical OR of REPLY_REPLIED, REPLY_FAVORITED, REPLY_RETWEETED
|
||||
val IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR = new Binary(
|
||||
"itl.engagement.is_replied_reply_engaged_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
|
||||
// Reciprocal engagements for fav forward engagement
|
||||
val IS_FAVORITED_FAV_FAVORITED_BY_AUTHOR = new Binary(
|
||||
"itl.engagement.is_favorited_fav_favorited_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateLikes, PublicLikes).asJava
|
||||
)
|
||||
val IS_FAVORITED_FAV_REPLIED_BY_AUTHOR = new Binary(
|
||||
"itl.engagement.is_favorited_fav_replied_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateReplies, PublicReplies).asJava
|
||||
)
|
||||
val IS_FAVORITED_FAV_RETWEETED_BY_AUTHOR = new Binary(
|
||||
"itl.engagement.is_favorited_fav_retweeted_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava
|
||||
)
|
||||
val IS_FAVORITED_FAV_FOLLOWED_BY_AUTHOR = new Binary(
|
||||
"itl.engagement.is_favorited_fav_followed_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava
|
||||
)
|
||||
// This derived label is the logical OR of FAV_REPLIED, FAV_FAVORITED, FAV_RETWEETED, FAV_FOLLOWED
|
||||
val IS_FAVORITED_FAV_ENGAGED_BY_AUTHOR = new Binary(
|
||||
"itl.engagement.is_favorited_fav_engaged_by_author",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava
|
||||
)
|
||||
|
||||
// define good profile click by considering following engagements (follow, fav, reply, retweet, etc.) at profile page
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_FOLLOW = new Binary(
|
||||
"itl.engagement.is_profile_clicked_and_profile_follow",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, Follow).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_FAV = new Binary(
|
||||
"itl.engagement.is_profile_clicked_and_profile_fav",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateLikes, PublicLikes).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_REPLY = new Binary(
|
||||
"itl.engagement.is_profile_clicked_and_profile_reply",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateReplies, PublicReplies).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_RETWEET = new Binary(
|
||||
"itl.engagement.is_profile_clicked_and_profile_retweet",
|
||||
Set(
|
||||
ProfilesViewed,
|
||||
ProfilesClicked,
|
||||
EngagementsPrivate,
|
||||
PrivateRetweets,
|
||||
PublicRetweets).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_CLICK = new Binary(
|
||||
"itl.engagement.is_profile_clicked_and_profile_tweet_click",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, TweetsClicked).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_SHARE_DM_CLICK = new Binary(
|
||||
"itl.engagement.is_profile_clicked_and_profile_share_dm_click",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
// This derived label is the union of all binary features above
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED = new Binary(
|
||||
"itl.engagement.is_profile_clicked_and_profile_engaged",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
|
||||
// define bad profile click by considering following engagements (user report, tweet report, mute, block, etc) at profile page
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_USER_REPORT_CLICK = new Binary(
|
||||
"itl.engagement.is_profile_clicked_and_profile_user_report_click",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_REPORT_CLICK = new Binary(
|
||||
"itl.engagement.is_profile_clicked_and_profile_tweet_report_click",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_MUTE = new Binary(
|
||||
"itl.engagement.is_profile_clicked_and_profile_mute",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_BLOCK = new Binary(
|
||||
"itl.engagement.is_profile_clicked_and_profile_block",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
// This derived label is the union of bad profile click engagements and existing negative feedback
|
||||
val IS_NEGATIVE_FEEDBACK_V2 = new Binary(
|
||||
"itl.engagement.is_negative_feedback_v2",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
// engagement for following user from any surface area
|
||||
val IS_FOLLOWED_FROM_ANY_SURFACE_AREA = new Binary(
|
||||
"itl.engagement.is_followed_from_any_surface_area",
|
||||
Set(EngagementsPublic, EngagementsPrivate).asJava)
|
||||
|
||||
// Relevance prompt tweet engagements
|
||||
val IS_RELEVANCE_PROMPT_YES_CLICKED =
|
||||
new Binary("itl.engagement.is_relevance_prompt_yes_clicked", Set(EngagementsPrivate).asJava)
|
||||
|
||||
// Reply downvote engagements
|
||||
val IS_REPLY_DOWNVOTED =
|
||||
new Binary("itl.engagement.is_reply_downvoted", Set(EngagementsPrivate).asJava)
|
||||
val IS_REPLY_DOWNVOTE_REMOVED =
|
||||
new Binary("itl.engagement.is_reply_downvote_removed", Set(EngagementsPrivate).asJava)
|
||||
|
||||
// features from RecommendedTweet
|
||||
val RECTWEET_SCORE = new Continuous("itl.recommended_tweet_features.rectweet_score")
|
||||
val NUM_FAVORITING_USERS = new Continuous("itl.recommended_tweet_features.num_favoriting_users")
|
||||
val NUM_FOLLOWING_USERS = new Continuous("itl.recommended_tweet_features.num_following_users")
|
||||
val CONTENT_SOURCE_TYPE = new Discrete("itl.recommended_tweet_features.content_source_type")
|
||||
|
||||
val RECOS_SCORE = new Continuous(
|
||||
"itl.recommended_tweet_features.recos_score",
|
||||
Set(EngagementScore, UsersRealGraphScore, UsersSalsaScore).asJava)
|
||||
val AUTHOR_REALGRAPH_SCORE = new Continuous(
|
||||
"itl.recommended_tweet_features.realgraph_score",
|
||||
Set(UsersRealGraphScore).asJava)
|
||||
val AUTHOR_SARUS_SCORE = new Continuous(
|
||||
"itl.recommended_tweet_features.sarus_score",
|
||||
Set(EngagementScore, UsersSalsaScore).asJava)
|
||||
|
||||
val NUM_INTERACTING_USERS = new Continuous(
|
||||
"itl.recommended_tweet_features.num_interacting_users",
|
||||
Set(EngagementScore).asJava
|
||||
)
|
||||
val MAX_REALGRAPH_SCORE_OF_INTERACTING_USERS = new Continuous(
|
||||
"itl.recommended_tweet_features.max_realgraph_score_of_interacting_users",
|
||||
Set(UsersRealGraphScore, EngagementScore).asJava
|
||||
)
|
||||
val SUM_REALGRAPH_SCORE_OF_INTERACTING_USERS = new Continuous(
|
||||
"itl.recommended_tweet_features.sum_realgraph_score_of_interacting_users",
|
||||
Set(UsersRealGraphScore, EngagementScore).asJava
|
||||
)
|
||||
val AVG_REALGRAPH_SCORE_OF_INTERACTING_USERS = new Continuous(
|
||||
"itl.recommended_tweet_features.avg_realgraph_score_of_interacting_users",
|
||||
Set(UsersRealGraphScore, EngagementScore).asJava
|
||||
)
|
||||
val MAX_SARUS_SCORE_OF_INTERACTING_USERS = new Continuous(
|
||||
"itl.recommended_tweet_features.max_sarus_score_of_interacting_users",
|
||||
Set(EngagementScore, UsersSalsaScore).asJava
|
||||
)
|
||||
val SUM_SARUS_SCORE_OF_INTERACTING_USERS = new Continuous(
|
||||
"itl.recommended_tweet_features.sum_sarus_score_of_interacting_users",
|
||||
Set(EngagementScore, UsersSalsaScore).asJava
|
||||
)
|
||||
val AVG_SARUS_SCORE_OF_INTERACTING_USERS = new Continuous(
|
||||
"itl.recommended_tweet_features.avg_sarus_score_of_interacting_users",
|
||||
Set(EngagementScore, UsersSalsaScore).asJava
|
||||
)
|
||||
|
||||
val NUM_INTERACTING_FOLLOWINGS = new Continuous(
|
||||
"itl.recommended_tweet_features.num_interacting_followings",
|
||||
Set(EngagementScore).asJava
|
||||
)
|
||||
|
||||
// features from HydratedTweetFeatures
|
||||
val REAL_GRAPH_WEIGHT =
|
||||
new Continuous("itl.hydrated_tweet_features.real_graph_weight", Set(UsersRealGraphScore).asJava)
|
||||
val SARUS_GRAPH_WEIGHT = new Continuous("itl.hydrated_tweet_features.sarus_graph_weight")
|
||||
val FROM_TOP_ENGAGED_USER = new Binary("itl.hydrated_tweet_features.from_top_engaged_user")
|
||||
val FROM_TOP_INFLUENCER = new Binary("itl.hydrated_tweet_features.from_top_influencer")
|
||||
// NOTE(review): the identifier is spelled INTERSTED, while the feature name string below spells
// "interested" correctly. The val name is part of the public interface, so renaming it would
// break any caller that references it; it is left unchanged here.
val TOPIC_SIM_SEARCHER_INTERSTED_IN_AUTHOR_KNOWN_FOR = new Continuous(
|
||||
"itl.hydrated_tweet_features.topic_sim_searcher_interested_in_author_known_for"
|
||||
)
|
||||
val TOPIC_SIM_SEARCHER_AUTHOR_BOTH_INTERESTED_IN = new Continuous(
|
||||
"itl.hydrated_tweet_features.topic_sim_searcher_author_both_interested_in"
|
||||
)
|
||||
val TOPIC_SIM_SEARCHER_AUTHOR_BOTH_KNOWN_FOR = new Continuous(
|
||||
"itl.hydrated_tweet_features.topic_sim_searcher_author_both_known_for"
|
||||
)
|
||||
val USER_REP = new Continuous("itl.hydrated_tweet_features.user_rep")
|
||||
val NORMALIZED_PARUS_SCORE = new Continuous("itl.hydrated_tweet_features.normalized_parus_score")
|
||||
val CONTAINS_MEDIA = new Binary("itl.hydrated_tweet_features.contains_media")
|
||||
val FROM_NEARBY = new Binary("itl.hydrated_tweet_features.from_nearby")
|
||||
val TOPIC_SIM_SEARCHER_INTERESTED_IN_TWEET = new Continuous(
|
||||
"itl.hydrated_tweet_features.topic_sim_searcher_interested_in_tweet"
|
||||
)
|
||||
val MATCHES_UI_LANG = new Binary(
|
||||
"itl.hydrated_tweet_features.matches_ui_lang",
|
||||
Set(ProvidedLanguage, InferredLanguage).asJava)
|
||||
val MATCHES_SEARCHER_MAIN_LANG = new Binary(
|
||||
"itl.hydrated_tweet_features.matches_searcher_main_lang",
|
||||
Set(ProvidedLanguage, InferredLanguage).asJava
|
||||
)
|
||||
val MATCHES_SEARCHER_LANGS = new Binary(
|
||||
"itl.hydrated_tweet_features.matches_searcher_langs",
|
||||
Set(ProvidedLanguage, InferredLanguage).asJava)
|
||||
val HAS_CARD = new Binary(
|
||||
"itl.hydrated_tweet_features.has_card",
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_IMAGE = new Binary(
|
||||
"itl.hydrated_tweet_features.has_image",
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_NATIVE_IMAGE = new Binary(
|
||||
"itl.hydrated_tweet_features.has_native_image",
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_VIDEO = new Binary("itl.hydrated_tweet_features.has_video")
|
||||
val HAS_CONSUMER_VIDEO = new Binary(
|
||||
"itl.hydrated_tweet_features.has_consumer_video",
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_PRO_VIDEO = new Binary(
|
||||
"itl.hydrated_tweet_features.has_pro_video",
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_PERISCOPE = new Binary(
|
||||
"itl.hydrated_tweet_features.has_periscope",
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_VINE = new Binary(
|
||||
"itl.hydrated_tweet_features.has_vine",
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_NATIVE_VIDEO = new Binary(
|
||||
"itl.hydrated_tweet_features.has_native_video",
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_LINK = new Binary(
|
||||
"itl.hydrated_tweet_features.has_link",
|
||||
Set(UrlFoundFlag, PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val LINK_COUNT = new Continuous(
|
||||
"itl.hydrated_tweet_features.link_count",
|
||||
Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava)
|
||||
val URL_DOMAINS = new SparseBinary(
|
||||
"itl.hydrated_tweet_features.url_domains",
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_VISIBLE_LINK = new Binary(
|
||||
"itl.hydrated_tweet_features.has_visible_link",
|
||||
Set(UrlFoundFlag, PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_NEWS = new Binary(
|
||||
"itl.hydrated_tweet_features.has_news",
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_TREND = new Binary(
|
||||
"itl.hydrated_tweet_features.has_trend",
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val BLENDER_SCORE =
|
||||
new Continuous("itl.hydrated_tweet_features.blender_score", Set(EngagementScore).asJava)
|
||||
val PARUS_SCORE =
|
||||
new Continuous("itl.hydrated_tweet_features.parus_score", Set(EngagementScore).asJava)
|
||||
val TEXT_SCORE =
|
||||
new Continuous("itl.hydrated_tweet_features.text_score", Set(EngagementScore).asJava)
|
||||
val BIDIRECTIONAL_REPLY_COUNT = new Continuous(
|
||||
"itl.hydrated_tweet_features.bidirectional_reply_count",
|
||||
Set(CountOfPrivateReplies, CountOfPublicReplies).asJava
|
||||
)
|
||||
val UNIDIRECTIONAL_REPLY_COUNT = new Continuous(
|
||||
"itl.hydrated_tweet_features.unidirectional_reply_count",
|
||||
Set(CountOfPrivateReplies, CountOfPublicReplies).asJava
|
||||
)
|
||||
val BIDIRECTIONAL_RETWEET_COUNT = new Continuous(
|
||||
"itl.hydrated_tweet_features.bidirectional_retweet_count",
|
||||
Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
|
||||
)
|
||||
val UNIDIRECTIONAL_RETWEET_COUNT = new Continuous(
|
||||
"itl.hydrated_tweet_features.unidirectional_retweet_count",
|
||||
Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
|
||||
)
|
||||
val BIDIRECTIONAL_FAV_COUNT = new Continuous(
|
||||
"itl.hydrated_tweet_features.bidirectional_fav_count",
|
||||
Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
|
||||
)
|
||||
val UNIDIRECTIONAL_FAV_COUNT = new Continuous(
|
||||
"itl.hydrated_tweet_features.unidirectional_fav_count",
|
||||
Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
|
||||
)
|
||||
val CONVERSATION_COUNT = new Continuous("itl.hydrated_tweet_features.conversation_count")
|
||||
val FAV_COUNT = new Continuous(
|
||||
"itl.hydrated_tweet_features.fav_count",
|
||||
Set(CountOfPrivateLikes, CountOfPublicLikes).asJava)
|
||||
val REPLY_COUNT = new Continuous(
|
||||
"itl.hydrated_tweet_features.reply_count",
|
||||
Set(CountOfPrivateReplies, CountOfPublicReplies).asJava)
|
||||
val RETWEET_COUNT = new Continuous(
|
||||
"itl.hydrated_tweet_features.retweet_count",
|
||||
Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava)
|
||||
// NOTE(review): the feature name string ends in "enagagement" (sic). It is passed verbatim to the
// Continuous constructor; presumably it identifies the feature in already-written data, so do not
// correct the spelling without confirming that nothing depends on the current name.
val PREV_USER_TWEET_ENGAGEMENT = new Continuous(
|
||||
"itl.hydrated_tweet_features.prev_user_tweet_enagagement",
|
||||
Set(EngagementScore, EngagementsPrivate, EngagementsPublic).asJava
|
||||
)
|
||||
val IS_SENSITIVE = new Binary("itl.hydrated_tweet_features.is_sensitive")
|
||||
val HAS_MULTIPLE_MEDIA = new Binary(
|
||||
"itl.hydrated_tweet_features.has_multiple_media",
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_MULTIPLE_HASHTAGS_OR_TRENDS = new Binary(
|
||||
"itl.hydrated_tweet_features.has_multiple_hashtag_or_trend",
|
||||
Set(
|
||||
UserVisibleFlag,
|
||||
CountOfPrivateTweetEntitiesAndMetadata,
|
||||
CountOfPublicTweetEntitiesAndMetadata).asJava)
|
||||
val IS_AUTHOR_PROFILE_EGG =
|
||||
new Binary("itl.hydrated_tweet_features.is_author_profile_egg", Set(ProfileImage).asJava)
|
||||
val IS_AUTHOR_NEW =
|
||||
new Binary("itl.hydrated_tweet_features.is_author_new", Set(UserType, UserState).asJava)
|
||||
val NUM_MENTIONS = new Continuous(
|
||||
"itl.hydrated_tweet_features.num_mentions",
|
||||
Set(
|
||||
UserVisibleFlag,
|
||||
CountOfPrivateTweetEntitiesAndMetadata,
|
||||
CountOfPublicTweetEntitiesAndMetadata).asJava)
|
||||
val NUM_HASHTAGS = new Continuous(
|
||||
"itl.hydrated_tweet_features.num_hashtags",
|
||||
Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava)
|
||||
// Language of the tweet, declared as a Discrete feature.
val LANGUAGE = new Discrete(
|
||||
"itl.hydrated_tweet_features.language",
|
||||
Set(ProvidedLanguage, InferredLanguage).asJava)
|
||||
// NOTE(review): declared Continuous, whereas LANGUAGE directly above is Discrete with the same
// personal-data types — confirm the feature type is intended before relying on it.
val LINK_LANGUAGE = new Continuous(
|
||||
"itl.hydrated_tweet_features.link_language",
|
||||
Set(ProvidedLanguage, InferredLanguage).asJava)
|
||||
val IS_AUTHOR_NSFW =
|
||||
new Binary("itl.hydrated_tweet_features.is_author_nsfw", Set(UserType).asJava)
|
||||
val IS_AUTHOR_SPAM =
|
||||
new Binary("itl.hydrated_tweet_features.is_author_spam", Set(UserType).asJava)
|
||||
val IS_AUTHOR_BOT = new Binary("itl.hydrated_tweet_features.is_author_bot", Set(UserType).asJava)
|
||||
val IS_OFFENSIVE = new Binary("itl.hydrated_tweet_features.is_offensive")
|
||||
val FROM_VERIFIED_ACCOUNT =
|
||||
new Binary("itl.hydrated_tweet_features.from_verified_account", Set(UserVerifiedFlag).asJava)
|
||||
val EMBEDS_IMPRESSION_COUNT = new Continuous(
|
||||
"itl.hydrated_tweet_features.embeds_impression_count",
|
||||
Set(CountOfImpression).asJava)
|
||||
val EMBEDS_URL_COUNT =
|
||||
new Continuous("itl.hydrated_tweet_features.embeds_url_count", Set(UrlFoundFlag).asJava)
|
||||
// NOTE(review): the three *_V2 counts below are named with the "recap.earlybird." prefix, whereas
// every other feature name in this object starts with "itl." — presumably the names are shared
// with another feature set; confirm before changing them.
val FAV_COUNT_V2 = new Continuous(
|
||||
"recap.earlybird.fav_count_v2",
|
||||
Set(CountOfPrivateLikes, CountOfPublicLikes).asJava)
|
||||
val RETWEET_COUNT_V2 = new Continuous(
|
||||
"recap.earlybird.retweet_count_v2",
|
||||
Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava)
|
||||
val REPLY_COUNT_V2 = new Continuous(
|
||||
"recap.earlybird.reply_count_v2",
|
||||
Set(CountOfPrivateReplies, CountOfPublicReplies).asJava)
|
||||
}
|
|
@ -0,0 +1,9 @@
|
|||
# Library target compiled from every Scala source file in this directory.
scala_library(
    sources = ["*.scala"],
    platform = "java8",
    tags = ["bazel-compatible"],
    # Targets this library is built against.
    dependencies = [
        "src/java/com/twitter/ml/api:api-base",
        "src/thrift/com/twitter/dal/personal_data:personal_data-java",
    ],
)
|
|
@ -0,0 +1,24 @@
|
|||
package com.twitter.timelines.prediction.features.list_features
|
||||
|
||||
import com.twitter.ml.api.Feature.{Binary, Discrete}
|
||||
import com.twitter.ml.api.FeatureContext
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/** Feature handles for the `list.*` signals, plus a [[FeatureContext]] that groups them. */
object ListFeatures {

  // Two list identifiers exist: `list.id` covers list tweet injections in home, whereas
  // `timelines.meta.list_id` covers list tweets shown in the list timeline.
  val LIST_ID: Discrete = new Discrete("list.id")

  val VIEWER_IS_OWNER: Binary = new Binary(
    "list.viewer.is_owner",
    Set(ListsNonpublicList, ListsPublicList).asJava
  )

  val VIEWER_IS_SUBSCRIBER: Binary = new Binary("list.viewer.is_subscriber")

  val IS_PINNED_LIST: Binary = new Binary("list.is_pinned")

  /** Context containing every feature declared in this object. */
  val featureContext: FeatureContext =
    new FeatureContext(LIST_ID, VIEWER_IS_OWNER, VIEWER_IS_SUBSCRIBER, IS_PINNED_LIST)
}
|
|
@ -0,0 +1,9 @@
|
|||
# Library target compiled from every Scala source file in this directory.
scala_library(
    sources = ["*.scala"],
    platform = "java8",
    tags = ["bazel-compatible"],
    # Targets this library is built against.
    dependencies = [
        "src/java/com/twitter/ml/api:api-base",
        "src/thrift/com/twitter/dal/personal_data:personal_data-java",
    ],
)
|
|
@ -0,0 +1,49 @@
|
|||
package com.twitter.timelines.prediction.features.p_home_latest
|
||||
|
||||
import com.twitter.ml.api.Feature.{Continuous, Discrete}
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/** Declares the discrete feature `home_latest.user_feature.last_login_timestamp_ms`. */
object HomeLatestUserFeatures {
  val LAST_LOGIN_TIMESTAMP_MS: Discrete = new Discrete(
    "home_latest.user_feature.last_login_timestamp_ms",
    Set(PrivateTimestamp).asJava
  )
}
|
||||
|
||||
/** Feature handles for the `home_latest.user_feature.*` aggregate signals. */
object HomeLatestUserAggregatesFeatures {

  /**
   * Serves as the `timestampFeature` of `OfflineAggregateSource`, which feature aggregations
   * require. By default it is set to the end timestamp of the `dateRange`.
   */
  val AGGREGATE_TIMESTAMP_MS: Discrete = new Discrete(
    "home_latest.user_feature.aggregate_timestamp_ms",
    Set(PrivateTimestamp).asJava
  )

  // Impression counters.
  val HOME_TOP_IMPRESSIONS: Continuous = new Continuous(
    "home_latest.user_feature.home_top_impressions",
    Set(CountOfImpression).asJava
  )

  val HOME_LATEST_IMPRESSIONS: Continuous = new Continuous(
    "home_latest.user_feature.home_latest_impressions",
    Set(CountOfImpression).asJava
  )

  // Timestamp-valued features.
  val HOME_TOP_LAST_LOGIN_TIMESTAMP_MS: Discrete = new Discrete(
    "home_latest.user_feature.home_top_last_login_timestamp_ms",
    Set(PrivateTimestamp).asJava
  )

  val HOME_LATEST_LAST_LOGIN_TIMESTAMP_MS: Discrete = new Discrete(
    "home_latest.user_feature.home_latest_last_login_timestamp_ms",
    Set(PrivateTimestamp).asJava
  )

  val HOME_LATEST_MOST_RECENT_CLICK_TIMESTAMP_MS: Discrete = new Discrete(
    "home_latest.user_feature.home_latest_most_recent_click_timestamp_ms",
    Set(PrivateTimestamp).asJava
  )
}
|
||||
|
||||
/**
 * Value holder pairing a user id with a last-login timestamp.
 *
 * @param userId               id of the user
 * @param lastLoginTimestampMs last-login timestamp (the field name indicates milliseconds)
 */
case class HomeLatestUserFeatures(
  userId: Long,
  lastLoginTimestampMs: Long)
|
||||
|
||||
/**
 * Value holder for one user's aggregate record. `userId` and `aggregateTimestampMs` are always
 * present; every other field is optional.
 */
case class HomeLatestUserAggregatesFeatures(
  userId: Long,
  aggregateTimestampMs: Long,
  // impression counters
  homeTopImpressions: Option[Double],
  homeLatestImpressions: Option[Double],
  // timestamps
  homeTopLastLoginTimestampMs: Option[Long],
  homeLatestLastLoginTimestampMs: Option[Long],
  homeLatestMostRecentClickTimestampMs: Option[Long]
)
|
|
@ -0,0 +1,8 @@
|
|||
# Library target compiled from every Scala source file in this directory.
scala_library(
    sources = ["*.scala"],
    platform = "java8",
    tags = ["bazel-compatible"],
    # Targets this library is built against.
    dependencies = [
        "src/java/com/twitter/ml/api:api-base",
    ],
)
|
|
@ -0,0 +1,7 @@
|
|||
package com.twitter.timelines.prediction.features.ppmi
|
||||
|
||||
import com.twitter.ml.api.Feature.Continuous
|
||||
|
||||
/** Declares the continuous feature named `ppmi.source_author.score`. */
object PpmiDataRecordFeatures {
  val PPMI_SCORE: Continuous =
    new Continuous("ppmi.source_author.score")
}
|
|
@ -0,0 +1,15 @@
|
|||
# Library target compiled from every Scala source file in this directory.
scala_library(
    sources = ["*.scala"],
    platform = "java8",
    tags = ["bazel-compatible"],
    # Targets this library is built against.
    dependencies = [
        "src/java/com/twitter/ml/api:api-base",
        "src/scala/com/twitter/ml/featurestore/catalog/entities/core",
        "src/scala/com/twitter/ml/featurestore/catalog/entities/timelines",
        "src/scala/com/twitter/ml/featurestore/catalog/features/timelines:realgraph",
        "src/scala/com/twitter/ml/featurestore/lib/entity",
        "src/scala/com/twitter/ml/featurestore/lib/feature",
        "src/thrift/com/twitter/dal/personal_data:personal_data-java",
        "src/thrift/com/twitter/timelines/real_graph:real_graph-scala",
    ],
)
|
|
@ -0,0 +1,232 @@
|
|||
package com.twitter.timelines.prediction.features.real_graph
|
||||
|
||||
import com.twitter.ml.featurestore.catalog.entities.core.UserAuthor
|
||||
import com.twitter.ml.featurestore.catalog.features.timelines.RealGraph
|
||||
import com.twitter.ml.featurestore.lib.EdgeEntityId
|
||||
import com.twitter.ml.featurestore.lib.UserId
|
||||
import com.twitter.ml.featurestore.lib.feature.BoundFeatureSet
|
||||
import com.twitter.ml.featurestore.lib.feature.Feature
|
||||
import com.twitter.ml.featurestore.lib.feature.FeatureSet
|
||||
|
||||
/**
 * Feature Store handles for RealGraph edge features, together with helpers that group them by
 * value type (`Double` / `Long`) and map them to dotted legacy feature names.
 */
object RealGraphDataRecordFeatureStoreFeatures {

  /** All RealGraph features enumerated below, bound to the `UserAuthor` entity. */
  val boundUserAuthorfeatureSet: BoundFeatureSet = FeatureSet(
    RealGraph.DestId,
    // AddressBookEmail
    RealGraph.AddressBookEmail.DaysSinceLast,
    RealGraph.AddressBookEmail.ElapsedDays,
    RealGraph.AddressBookEmail.Ewma,
    RealGraph.AddressBookEmail.IsMissing,
    RealGraph.AddressBookEmail.Mean,
    RealGraph.AddressBookEmail.NonZeroDays,
    RealGraph.AddressBookEmail.Variance,
    // AddressBookInBoth
    RealGraph.AddressBookInBoth.DaysSinceLast,
    RealGraph.AddressBookInBoth.ElapsedDays,
    RealGraph.AddressBookInBoth.Ewma,
    RealGraph.AddressBookInBoth.IsMissing,
    RealGraph.AddressBookInBoth.Mean,
    RealGraph.AddressBookInBoth.NonZeroDays,
    RealGraph.AddressBookInBoth.Variance,
    // AddressBookMutualEdgeEmail
    RealGraph.AddressBookMutualEdgeEmail.DaysSinceLast,
    RealGraph.AddressBookMutualEdgeEmail.ElapsedDays,
    RealGraph.AddressBookMutualEdgeEmail.Ewma,
    RealGraph.AddressBookMutualEdgeEmail.IsMissing,
    RealGraph.AddressBookMutualEdgeEmail.Mean,
    RealGraph.AddressBookMutualEdgeEmail.NonZeroDays,
    RealGraph.AddressBookMutualEdgeEmail.Variance,
    // AddressBookMutualEdgeInBoth
    RealGraph.AddressBookMutualEdgeInBoth.DaysSinceLast,
    RealGraph.AddressBookMutualEdgeInBoth.ElapsedDays,
    RealGraph.AddressBookMutualEdgeInBoth.Ewma,
    RealGraph.AddressBookMutualEdgeInBoth.IsMissing,
    RealGraph.AddressBookMutualEdgeInBoth.Mean,
    RealGraph.AddressBookMutualEdgeInBoth.NonZeroDays,
    RealGraph.AddressBookMutualEdgeInBoth.Variance,
    // AddressBookMutualEdgePhone
    RealGraph.AddressBookMutualEdgePhone.DaysSinceLast,
    RealGraph.AddressBookMutualEdgePhone.ElapsedDays,
    RealGraph.AddressBookMutualEdgePhone.Ewma,
    RealGraph.AddressBookMutualEdgePhone.IsMissing,
    RealGraph.AddressBookMutualEdgePhone.Mean,
    RealGraph.AddressBookMutualEdgePhone.NonZeroDays,
    RealGraph.AddressBookMutualEdgePhone.Variance,
    // AddressBookPhone
    RealGraph.AddressBookPhone.DaysSinceLast,
    RealGraph.AddressBookPhone.ElapsedDays,
    RealGraph.AddressBookPhone.Ewma,
    RealGraph.AddressBookPhone.IsMissing,
    RealGraph.AddressBookPhone.Mean,
    RealGraph.AddressBookPhone.NonZeroDays,
    RealGraph.AddressBookPhone.Variance,
    // DirectMessages
    RealGraph.DirectMessages.DaysSinceLast,
    RealGraph.DirectMessages.ElapsedDays,
    RealGraph.DirectMessages.Ewma,
    RealGraph.DirectMessages.IsMissing,
    RealGraph.DirectMessages.Mean,
    RealGraph.DirectMessages.NonZeroDays,
    RealGraph.DirectMessages.Variance,
    // DwellTime
    RealGraph.DwellTime.DaysSinceLast,
    RealGraph.DwellTime.ElapsedDays,
    RealGraph.DwellTime.Ewma,
    RealGraph.DwellTime.IsMissing,
    RealGraph.DwellTime.Mean,
    RealGraph.DwellTime.NonZeroDays,
    RealGraph.DwellTime.Variance,
    // Follow
    RealGraph.Follow.DaysSinceLast,
    RealGraph.Follow.ElapsedDays,
    RealGraph.Follow.Ewma,
    RealGraph.Follow.IsMissing,
    RealGraph.Follow.Mean,
    RealGraph.Follow.NonZeroDays,
    RealGraph.Follow.Variance,
    // InspectedStatuses
    RealGraph.InspectedStatuses.DaysSinceLast,
    RealGraph.InspectedStatuses.ElapsedDays,
    RealGraph.InspectedStatuses.Ewma,
    RealGraph.InspectedStatuses.IsMissing,
    RealGraph.InspectedStatuses.Mean,
    RealGraph.InspectedStatuses.NonZeroDays,
    RealGraph.InspectedStatuses.Variance,
    // Likes
    RealGraph.Likes.DaysSinceLast,
    RealGraph.Likes.ElapsedDays,
    RealGraph.Likes.Ewma,
    RealGraph.Likes.IsMissing,
    RealGraph.Likes.Mean,
    RealGraph.Likes.NonZeroDays,
    RealGraph.Likes.Variance,
    // LinkClicks
    RealGraph.LinkClicks.DaysSinceLast,
    RealGraph.LinkClicks.ElapsedDays,
    RealGraph.LinkClicks.Ewma,
    RealGraph.LinkClicks.IsMissing,
    RealGraph.LinkClicks.Mean,
    RealGraph.LinkClicks.NonZeroDays,
    RealGraph.LinkClicks.Variance,
    // Mentions
    RealGraph.Mentions.DaysSinceLast,
    RealGraph.Mentions.ElapsedDays,
    RealGraph.Mentions.Ewma,
    RealGraph.Mentions.IsMissing,
    RealGraph.Mentions.Mean,
    RealGraph.Mentions.NonZeroDays,
    RealGraph.Mentions.Variance,
    // MutualFollow
    RealGraph.MutualFollow.DaysSinceLast,
    RealGraph.MutualFollow.ElapsedDays,
    RealGraph.MutualFollow.Ewma,
    RealGraph.MutualFollow.IsMissing,
    RealGraph.MutualFollow.Mean,
    RealGraph.MutualFollow.NonZeroDays,
    RealGraph.MutualFollow.Variance,
    // NumTweetQuotes
    RealGraph.NumTweetQuotes.DaysSinceLast,
    RealGraph.NumTweetQuotes.ElapsedDays,
    RealGraph.NumTweetQuotes.Ewma,
    RealGraph.NumTweetQuotes.IsMissing,
    RealGraph.NumTweetQuotes.Mean,
    RealGraph.NumTweetQuotes.NonZeroDays,
    RealGraph.NumTweetQuotes.Variance,
    // PhotoTags
    RealGraph.PhotoTags.DaysSinceLast,
    RealGraph.PhotoTags.ElapsedDays,
    RealGraph.PhotoTags.Ewma,
    RealGraph.PhotoTags.IsMissing,
    RealGraph.PhotoTags.Mean,
    RealGraph.PhotoTags.NonZeroDays,
    RealGraph.PhotoTags.Variance,
    // ProfileViews
    RealGraph.ProfileViews.DaysSinceLast,
    RealGraph.ProfileViews.ElapsedDays,
    RealGraph.ProfileViews.Ewma,
    RealGraph.ProfileViews.IsMissing,
    RealGraph.ProfileViews.Mean,
    RealGraph.ProfileViews.NonZeroDays,
    RealGraph.ProfileViews.Variance,
    // Retweets
    RealGraph.Retweets.DaysSinceLast,
    RealGraph.Retweets.ElapsedDays,
    RealGraph.Retweets.Ewma,
    RealGraph.Retweets.IsMissing,
    RealGraph.Retweets.Mean,
    RealGraph.Retweets.NonZeroDays,
    RealGraph.Retweets.Variance,
    // SmsFollow
    RealGraph.SmsFollow.DaysSinceLast,
    RealGraph.SmsFollow.ElapsedDays,
    RealGraph.SmsFollow.Ewma,
    RealGraph.SmsFollow.IsMissing,
    RealGraph.SmsFollow.Mean,
    RealGraph.SmsFollow.NonZeroDays,
    RealGraph.SmsFollow.Variance,
    // TweetClicks
    RealGraph.TweetClicks.DaysSinceLast,
    RealGraph.TweetClicks.ElapsedDays,
    RealGraph.TweetClicks.Ewma,
    RealGraph.TweetClicks.IsMissing,
    RealGraph.TweetClicks.Mean,
    RealGraph.TweetClicks.NonZeroDays,
    RealGraph.TweetClicks.Variance,
    RealGraph.Weight
  ).bind(UserAuthor)

  // Edge features that feed the two htl* sets below.
  // NOTE(review): RealGraph.NumTweetQuotes is part of boundUserAuthorfeatureSet but is absent from
  // this list and from edgeFeatureToLegacyName -- confirm that the omission is intentional.
  private[this] val edgeFeatures: Seq[RealGraph.EdgeFeature] = Seq(
    RealGraph.AddressBookEmail,
    RealGraph.AddressBookInBoth,
    RealGraph.AddressBookMutualEdgeEmail,
    RealGraph.AddressBookMutualEdgeInBoth,
    RealGraph.AddressBookMutualEdgePhone,
    RealGraph.AddressBookPhone,
    RealGraph.DirectMessages,
    RealGraph.DwellTime,
    RealGraph.Follow,
    RealGraph.InspectedStatuses,
    RealGraph.Likes,
    RealGraph.LinkClicks,
    RealGraph.Mentions,
    RealGraph.MutualFollow,
    RealGraph.PhotoTags,
    RealGraph.ProfileViews,
    RealGraph.Retweets,
    RealGraph.SmsFollow,
    RealGraph.TweetClicks
  )

  /** Ewma, Mean and Variance of every entry in `edgeFeatures`, plus `RealGraph.Weight`. */
  val htlDoubleFeatures: Set[Feature[EdgeEntityId[UserId, UserId], Double]] = {
    val perEdge = for {
      edge <- edgeFeatures
      stat <- Seq(edge.Ewma, edge.Mean, edge.Variance)
    } yield stat
    (perEdge ++ Seq(RealGraph.Weight)).toSet
  }

  /** DaysSinceLast, ElapsedDays and NonZeroDays of every entry in `edgeFeatures`. */
  val htlLongFeatures: Set[Feature[EdgeEntityId[UserId, UserId], Long]] = {
    val perEdge = for {
      edge <- edgeFeatures
      stat <- Seq(edge.DaysSinceLast, edge.ElapsedDays, edge.NonZeroDays)
    } yield stat
    perEdge.toSet
  }

  // Middle segment of the legacy feature name for each edge feature.
  // NOTE(review): RealGraphDataRecordFeatures (same package) names its direct-message features
  // `realgraph.num_direct_messages.*`, while the segment here is `direct_messages` -- confirm.
  private val edgeFeatureToLegacyName = Map(
    RealGraph.AddressBookEmail -> "num_address_book_email",
    RealGraph.AddressBookInBoth -> "num_address_book_in_both",
    RealGraph.AddressBookMutualEdgeEmail -> "num_address_book_mutual_edge_email",
    RealGraph.AddressBookMutualEdgeInBoth -> "num_address_book_mutual_edge_in_both",
    RealGraph.AddressBookMutualEdgePhone -> "num_address_book_mutual_edge_phone",
    RealGraph.AddressBookPhone -> "num_address_book_phone",
    RealGraph.DirectMessages -> "direct_messages",
    RealGraph.DwellTime -> "total_dwell_time",
    RealGraph.Follow -> "num_follow",
    RealGraph.InspectedStatuses -> "num_inspected_tweets",
    RealGraph.Likes -> "num_favorites",
    RealGraph.LinkClicks -> "num_link_clicks",
    RealGraph.Mentions -> "num_mentions",
    RealGraph.MutualFollow -> "num_mutual_follow",
    RealGraph.PhotoTags -> "num_photo_tags",
    RealGraph.ProfileViews -> "num_profile_views",
    RealGraph.Retweets -> "num_retweets",
    RealGraph.SmsFollow -> "num_sms_follow",
    RealGraph.TweetClicks -> "num_tweet_clicks",
  )

  /**
   * Builds a lookup from Feature Store features to legacy names shaped like
   * `<prefix>.<edge legacy name>.<statistic>`; `RealGraph.Weight` maps to `<prefix>.weight`.
   *
   * @param prefix   leading segment of every generated name
   * @param variance trailing segment used for each edge's Variance feature
   * @return map from feature to its legacy name
   */
  def convertFeatureToLegacyName(
    prefix: String,
    variance: String = "variance"
  ): Map[Feature[EdgeEntityId[UserId, UserId], _ >: Long with Double <: AnyVal], String] =
    edgeFeatureToLegacyName.flatMap {
      case (edge, v) =>
        Seq(
          // Long-valued statistics
          edge.NonZeroDays -> s"${prefix}.${v}.non_zero_days",
          edge.DaysSinceLast -> s"${prefix}.${v}.days_since_last",
          edge.ElapsedDays -> s"${prefix}.${v}.elapsed_days",
          // Double-valued statistics
          edge.Ewma -> s"${prefix}.${v}.ewma",
          edge.Mean -> s"${prefix}.${v}.mean",
          edge.Variance -> s"${prefix}.${v}.${variance}",
        )
    } ++ Map(
      RealGraph.Weight -> (prefix + ".weight")
    )
}
|
|
@ -0,0 +1,534 @@
|
|||
package com.twitter.timelines.prediction.features.real_graph
|
||||
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import com.twitter.ml.api.Feature._
|
||||
import com.twitter.timelines.real_graph.v1.thriftscala.RealGraphEdgeFeature
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
|
||||
object RealGraphDataRecordFeatures {
|
||||
// the source user id
|
||||
val SRC_ID = new Discrete("realgraph.src_id", Set(UserId).asJava)
|
||||
// the destination user id
|
||||
val DST_ID = new Discrete("realgraph.dst_id", Set(UserId).asJava)
|
||||
// real graph weight
|
||||
val WEIGHT = new Continuous("realgraph.weight", Set(UsersRealGraphScore).asJava)
|
||||
// the number of retweets that the source user sent to the destination user
|
||||
val NUM_RETWEETS_MEAN =
|
||||
new Continuous("realgraph.num_retweets.mean", Set(PrivateRetweets, PublicRetweets).asJava)
|
||||
val NUM_RETWEETS_EWMA =
|
||||
new Continuous("realgraph.num_retweets.ewma", Set(PrivateRetweets, PublicRetweets).asJava)
|
||||
val NUM_RETWEETS_VARIANCE =
|
||||
new Continuous("realgraph.num_retweets.variance", Set(PrivateRetweets, PublicRetweets).asJava)
|
||||
val NUM_RETWEETS_NON_ZERO_DAYS = new Continuous(
|
||||
"realgraph.num_retweets.non_zero_days",
|
||||
Set(PrivateRetweets, PublicRetweets).asJava)
|
||||
val NUM_RETWEETS_ELAPSED_DAYS = new Continuous(
|
||||
"realgraph.num_retweets.elapsed_days",
|
||||
Set(PrivateRetweets, PublicRetweets).asJava)
|
||||
val NUM_RETWEETS_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_retweets.days_since_last",
|
||||
Set(PrivateRetweets, PublicRetweets).asJava)
|
||||
val NUM_RETWEETS_IS_MISSING =
|
||||
new Binary("realgraph.num_retweets.is_missing", Set(PrivateRetweets, PublicRetweets).asJava)
|
||||
// the number of favorites that the source user sent to the destination user
|
||||
val NUM_FAVORITES_MEAN =
|
||||
new Continuous("realgraph.num_favorites.mean", Set(PublicLikes, PrivateLikes).asJava)
|
||||
val NUM_FAVORITES_EWMA =
|
||||
new Continuous("realgraph.num_favorites.ewma", Set(PublicLikes, PrivateLikes).asJava)
|
||||
val NUM_FAVORITES_VARIANCE =
|
||||
new Continuous("realgraph.num_favorites.variance", Set(PublicLikes, PrivateLikes).asJava)
|
||||
val NUM_FAVORITES_NON_ZERO_DAYS =
|
||||
new Continuous("realgraph.num_favorites.non_zero_days", Set(PublicLikes, PrivateLikes).asJava)
|
||||
val NUM_FAVORITES_ELAPSED_DAYS =
|
||||
new Continuous("realgraph.num_favorites.elapsed_days", Set(PublicLikes, PrivateLikes).asJava)
|
||||
val NUM_FAVORITES_DAYS_SINCE_LAST =
|
||||
new Continuous("realgraph.num_favorites.days_since_last", Set(PublicLikes, PrivateLikes).asJava)
|
||||
val NUM_FAVORITES_IS_MISSING =
|
||||
new Binary("realgraph.num_favorites.is_missing", Set(PublicLikes, PrivateLikes).asJava)
|
||||
// the number of mentions that the source user sent to the destination user
|
||||
val NUM_MENTIONS_MEAN =
|
||||
new Continuous("realgraph.num_mentions.mean", Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val NUM_MENTIONS_EWMA =
|
||||
new Continuous("realgraph.num_mentions.ewma", Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val NUM_MENTIONS_VARIANCE = new Continuous(
|
||||
"realgraph.num_mentions.variance",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val NUM_MENTIONS_NON_ZERO_DAYS = new Continuous(
|
||||
"realgraph.num_mentions.non_zero_days",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val NUM_MENTIONS_ELAPSED_DAYS = new Continuous(
|
||||
"realgraph.num_mentions.elapsed_days",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val NUM_MENTIONS_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_mentions.days_since_last",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val NUM_MENTIONS_IS_MISSING = new Binary(
|
||||
"realgraph.num_mentions.is_missing",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
// the number of direct messages that the source user sent to the destination user
|
||||
val NUM_DIRECT_MESSAGES_MEAN = new Continuous(
|
||||
"realgraph.num_direct_messages.mean",
|
||||
Set(DmEntitiesAndMetadata, CountOfDms).asJava)
|
||||
val NUM_DIRECT_MESSAGES_EWMA = new Continuous(
|
||||
"realgraph.num_direct_messages.ewma",
|
||||
Set(DmEntitiesAndMetadata, CountOfDms).asJava)
|
||||
val NUM_DIRECT_MESSAGES_VARIANCE = new Continuous(
|
||||
"realgraph.num_direct_messages.variance",
|
||||
Set(DmEntitiesAndMetadata, CountOfDms).asJava)
|
||||
val NUM_DIRECT_MESSAGES_NON_ZERO_DAYS = new Continuous(
|
||||
"realgraph.num_direct_messages.non_zero_days",
|
||||
Set(DmEntitiesAndMetadata, CountOfDms).asJava
|
||||
)
|
||||
val NUM_DIRECT_MESSAGES_ELAPSED_DAYS = new Continuous(
|
||||
"realgraph.num_direct_messages.elapsed_days",
|
||||
Set(DmEntitiesAndMetadata, CountOfDms).asJava
|
||||
)
|
||||
val NUM_DIRECT_MESSAGES_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_direct_messages.days_since_last",
|
||||
Set(DmEntitiesAndMetadata, CountOfDms).asJava
|
||||
)
|
||||
val NUM_DIRECT_MESSAGES_IS_MISSING = new Binary(
|
||||
"realgraph.num_direct_messages.is_missing",
|
||||
Set(DmEntitiesAndMetadata, CountOfDms).asJava)
|
||||
// the number of tweet clicks that the source user sent to the destination user
|
||||
val NUM_TWEET_CLICKS_MEAN =
|
||||
new Continuous("realgraph.num_tweet_clicks.mean", Set(TweetsClicked).asJava)
|
||||
val NUM_TWEET_CLICKS_EWMA =
|
||||
new Continuous("realgraph.num_tweet_clicks.ewma", Set(TweetsClicked).asJava)
|
||||
val NUM_TWEET_CLICKS_VARIANCE =
|
||||
new Continuous("realgraph.num_tweet_clicks.variance", Set(TweetsClicked).asJava)
|
||||
val NUM_TWEET_CLICKS_NON_ZERO_DAYS =
|
||||
new Continuous("realgraph.num_tweet_clicks.non_zero_days", Set(TweetsClicked).asJava)
|
||||
val NUM_TWEET_CLICKS_ELAPSED_DAYS =
|
||||
new Continuous("realgraph.num_tweet_clicks.elapsed_days", Set(TweetsClicked).asJava)
|
||||
val NUM_TWEET_CLICKS_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_tweet_clicks.days_since_last",
|
||||
Set(TweetsClicked).asJava
|
||||
)
|
||||
val NUM_TWEET_CLICKS_IS_MISSING =
|
||||
new Binary("realgraph.num_tweet_clicks.is_missing", Set(TweetsClicked).asJava)
|
||||
// the number of link clicks that the source user sent to the destination user
|
||||
val NUM_LINK_CLICKS_MEAN =
|
||||
new Continuous("realgraph.num_link_clicks.mean", Set(CountOfTweetEntitiesClicked).asJava)
|
||||
val NUM_LINK_CLICKS_EWMA =
|
||||
new Continuous("realgraph.num_link_clicks.ewma", Set(CountOfTweetEntitiesClicked).asJava)
|
||||
val NUM_LINK_CLICKS_VARIANCE =
|
||||
new Continuous("realgraph.num_link_clicks.variance", Set(CountOfTweetEntitiesClicked).asJava)
|
||||
val NUM_LINK_CLICKS_NON_ZERO_DAYS = new Continuous(
|
||||
"realgraph.num_link_clicks.non_zero_days",
|
||||
Set(CountOfTweetEntitiesClicked).asJava)
|
||||
val NUM_LINK_CLICKS_ELAPSED_DAYS = new Continuous(
|
||||
"realgraph.num_link_clicks.elapsed_days",
|
||||
Set(CountOfTweetEntitiesClicked).asJava)
|
||||
val NUM_LINK_CLICKS_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_link_clicks.days_since_last",
|
||||
Set(CountOfTweetEntitiesClicked).asJava)
|
||||
val NUM_LINK_CLICKS_IS_MISSING =
|
||||
new Binary("realgraph.num_link_clicks.is_missing", Set(CountOfTweetEntitiesClicked).asJava)
|
||||
// the number of profile views that the source user sent to the destination user
|
||||
val NUM_PROFILE_VIEWS_MEAN =
|
||||
new Continuous("realgraph.num_profile_views.mean", Set(ProfilesViewed).asJava)
|
||||
val NUM_PROFILE_VIEWS_EWMA =
|
||||
new Continuous("realgraph.num_profile_views.ewma", Set(ProfilesViewed).asJava)
|
||||
val NUM_PROFILE_VIEWS_VARIANCE =
|
||||
new Continuous("realgraph.num_profile_views.variance", Set(ProfilesViewed).asJava)
|
||||
val NUM_PROFILE_VIEWS_NON_ZERO_DAYS =
|
||||
new Continuous("realgraph.num_profile_views.non_zero_days", Set(ProfilesViewed).asJava)
|
||||
val NUM_PROFILE_VIEWS_ELAPSED_DAYS =
|
||||
new Continuous("realgraph.num_profile_views.elapsed_days", Set(ProfilesViewed).asJava)
|
||||
val NUM_PROFILE_VIEWS_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_profile_views.days_since_last",
|
||||
Set(ProfilesViewed).asJava
|
||||
)
|
||||
val NUM_PROFILE_VIEWS_IS_MISSING =
|
||||
new Binary("realgraph.num_profile_views.is_missing", Set(ProfilesViewed).asJava)
|
||||
// the total dwell time the source user spends on the target user's tweets
|
||||
val TOTAL_DWELL_TIME_MEAN =
|
||||
new Continuous("realgraph.total_dwell_time.mean", Set(CountOfImpression).asJava)
|
||||
val TOTAL_DWELL_TIME_EWMA =
|
||||
new Continuous("realgraph.total_dwell_time.ewma", Set(CountOfImpression).asJava)
|
||||
val TOTAL_DWELL_TIME_VARIANCE =
|
||||
new Continuous("realgraph.total_dwell_time.variance", Set(CountOfImpression).asJava)
|
||||
val TOTAL_DWELL_TIME_NON_ZERO_DAYS =
|
||||
new Continuous("realgraph.total_dwell_time.non_zero_days", Set(CountOfImpression).asJava)
|
||||
val TOTAL_DWELL_TIME_ELAPSED_DAYS =
|
||||
new Continuous("realgraph.total_dwell_time.elapsed_days", Set(CountOfImpression).asJava)
|
||||
val TOTAL_DWELL_TIME_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.total_dwell_time.days_since_last",
|
||||
Set(CountOfImpression).asJava
|
||||
)
|
||||
val TOTAL_DWELL_TIME_IS_MISSING =
|
||||
new Binary("realgraph.total_dwell_time.is_missing", Set(CountOfImpression).asJava)
|
||||
// the number of the target user's tweets that the source user has inspected
|
||||
val NUM_INSPECTED_TWEETS_MEAN =
|
||||
new Continuous("realgraph.num_inspected_tweets.mean", Set(CountOfImpression).asJava)
|
||||
val NUM_INSPECTED_TWEETS_EWMA =
|
||||
new Continuous("realgraph.num_inspected_tweets.ewma", Set(CountOfImpression).asJava)
|
||||
val NUM_INSPECTED_TWEETS_VARIANCE =
|
||||
new Continuous("realgraph.num_inspected_tweets.variance", Set(CountOfImpression).asJava)
|
||||
val NUM_INSPECTED_TWEETS_NON_ZERO_DAYS = new Continuous(
|
||||
"realgraph.num_inspected_tweets.non_zero_days",
|
||||
Set(CountOfImpression).asJava
|
||||
)
|
||||
val NUM_INSPECTED_TWEETS_ELAPSED_DAYS = new Continuous(
|
||||
"realgraph.num_inspected_tweets.elapsed_days",
|
||||
Set(CountOfImpression).asJava
|
||||
)
|
||||
val NUM_INSPECTED_TWEETS_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_inspected_tweets.days_since_last",
|
||||
Set(CountOfImpression).asJava
|
||||
)
|
||||
val NUM_INSPECTED_TWEETS_IS_MISSING =
|
||||
new Binary("realgraph.num_inspected_tweets.is_missing", Set(CountOfImpression).asJava)
|
||||
// the number of photos in which the source user has tagged the target user
|
||||
val NUM_PHOTO_TAGS_MEAN = new Continuous(
|
||||
"realgraph.num_photo_tags.mean",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val NUM_PHOTO_TAGS_EWMA = new Continuous(
|
||||
"realgraph.num_photo_tags.ewma",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val NUM_PHOTO_TAGS_VARIANCE = new Continuous(
|
||||
"realgraph.num_photo_tags.variance",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val NUM_PHOTO_TAGS_NON_ZERO_DAYS = new Continuous(
|
||||
"realgraph.num_photo_tags.non_zero_days",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val NUM_PHOTO_TAGS_ELAPSED_DAYS = new Continuous(
|
||||
"realgraph.num_photo_tags.elapsed_days",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val NUM_PHOTO_TAGS_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_photo_tags.days_since_last",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val NUM_PHOTO_TAGS_IS_MISSING = new Binary(
|
||||
"realgraph.num_photo_tags.is_missing",
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
|
||||
val NUM_FOLLOW_MEAN = new Continuous(
|
||||
"realgraph.num_follow.mean",
|
||||
Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava)
|
||||
val NUM_FOLLOW_EWMA = new Continuous(
|
||||
"realgraph.num_follow.ewma",
|
||||
Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava)
|
||||
val NUM_FOLLOW_VARIANCE = new Continuous(
|
||||
"realgraph.num_follow.variance",
|
||||
Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava)
|
||||
val NUM_FOLLOW_NON_ZERO_DAYS = new Continuous(
|
||||
"realgraph.num_follow.non_zero_days",
|
||||
Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava)
|
||||
val NUM_FOLLOW_ELAPSED_DAYS = new Continuous(
|
||||
"realgraph.num_follow.elapsed_days",
|
||||
Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava)
|
||||
val NUM_FOLLOW_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_follow.days_since_last",
|
||||
Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava)
|
||||
val NUM_FOLLOW_IS_MISSING = new Binary(
|
||||
"realgraph.num_follow.is_missing",
|
||||
Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava)
|
||||
// the number of blocks that the source user sent to the destination user
|
||||
val NUM_BLOCKS_MEAN =
|
||||
new Continuous("realgraph.num_blocks.mean", Set(CountOfBlocks).asJava)
|
||||
val NUM_BLOCKS_EWMA =
|
||||
new Continuous("realgraph.num_blocks.ewma", Set(CountOfBlocks).asJava)
|
||||
val NUM_BLOCKS_VARIANCE =
|
||||
new Continuous("realgraph.num_blocks.variance", Set(CountOfBlocks).asJava)
|
||||
val NUM_BLOCKS_NON_ZERO_DAYS =
|
||||
new Continuous("realgraph.num_blocks.non_zero_days", Set(CountOfBlocks).asJava)
|
||||
val NUM_BLOCKS_ELAPSED_DAYS =
|
||||
new Continuous("realgraph.num_blocks.elapsed_days", Set(CountOfBlocks).asJava)
|
||||
val NUM_BLOCKS_DAYS_SINCE_LAST =
|
||||
new Continuous("realgraph.num_blocks.days_since_last", Set(CountOfBlocks).asJava)
|
||||
val NUM_BLOCKS_IS_MISSING =
|
||||
new Binary("realgraph.num_blocks.is_missing", Set(CountOfBlocks).asJava)
|
||||
// the number of mutes that the source user sent to the destination user
|
||||
val NUM_MUTES_MEAN =
|
||||
new Continuous("realgraph.num_mutes.mean", Set(CountOfMutes).asJava)
|
||||
val NUM_MUTES_EWMA =
|
||||
new Continuous("realgraph.num_mutes.ewma", Set(CountOfMutes).asJava)
|
||||
val NUM_MUTES_VARIANCE =
|
||||
new Continuous("realgraph.num_mutes.variance", Set(CountOfMutes).asJava)
|
||||
val NUM_MUTES_NON_ZERO_DAYS =
|
||||
new Continuous("realgraph.num_mutes.non_zero_days", Set(CountOfMutes).asJava)
|
||||
val NUM_MUTES_ELAPSED_DAYS =
|
||||
new Continuous("realgraph.num_mutes.elapsed_days", Set(CountOfMutes).asJava)
|
||||
val NUM_MUTES_DAYS_SINCE_LAST =
|
||||
new Continuous("realgraph.num_mutes.days_since_last", Set(CountOfMutes).asJava)
|
||||
val NUM_MUTES_IS_MISSING =
|
||||
new Binary("realgraph.num_mutes.is_missing", Set(CountOfMutes).asJava)
|
||||
// the number of report as abuses that the source user sent to the destination user
|
||||
val NUM_REPORTS_AS_ABUSES_MEAN =
|
||||
new Continuous("realgraph.num_report_as_abuses.mean", Set(CountOfAbuseReports).asJava)
|
||||
val NUM_REPORTS_AS_ABUSES_EWMA =
|
||||
new Continuous("realgraph.num_report_as_abuses.ewma", Set(CountOfAbuseReports).asJava)
|
||||
val NUM_REPORTS_AS_ABUSES_VARIANCE =
|
||||
new Continuous("realgraph.num_report_as_abuses.variance", Set(CountOfAbuseReports).asJava)
|
||||
val NUM_REPORTS_AS_ABUSES_NON_ZERO_DAYS =
|
||||
new Continuous("realgraph.num_report_as_abuses.non_zero_days", Set(CountOfAbuseReports).asJava)
|
||||
val NUM_REPORTS_AS_ABUSES_ELAPSED_DAYS =
|
||||
new Continuous("realgraph.num_report_as_abuses.elapsed_days", Set(CountOfAbuseReports).asJava)
|
||||
val NUM_REPORTS_AS_ABUSES_DAYS_SINCE_LAST =
|
||||
new Continuous(
|
||||
"realgraph.num_report_as_abuses.days_since_last",
|
||||
Set(CountOfAbuseReports).asJava)
|
||||
val NUM_REPORTS_AS_ABUSES_IS_MISSING =
|
||||
new Binary("realgraph.num_report_as_abuses.is_missing", Set(CountOfAbuseReports).asJava)
|
||||
// the number of times the source user reported the destination user as spam
|
||||
val NUM_REPORTS_AS_SPAMS_MEAN =
|
||||
new Continuous(
|
||||
"realgraph.num_report_as_spams.mean",
|
||||
Set(CountOfAbuseReports, SafetyRelationships).asJava)
|
||||
val NUM_REPORTS_AS_SPAMS_EWMA =
|
||||
new Continuous(
|
||||
"realgraph.num_report_as_spams.ewma",
|
||||
Set(CountOfAbuseReports, SafetyRelationships).asJava)
|
||||
val NUM_REPORTS_AS_SPAMS_VARIANCE =
|
||||
new Continuous(
|
||||
"realgraph.num_report_as_spams.variance",
|
||||
Set(CountOfAbuseReports, SafetyRelationships).asJava)
|
||||
val NUM_REPORTS_AS_SPAMS_NON_ZERO_DAYS =
|
||||
new Continuous(
|
||||
"realgraph.num_report_as_spams.non_zero_days",
|
||||
Set(CountOfAbuseReports, SafetyRelationships).asJava)
|
||||
val NUM_REPORTS_AS_SPAMS_ELAPSED_DAYS =
|
||||
new Continuous(
|
||||
"realgraph.num_report_as_spams.elapsed_days",
|
||||
Set(CountOfAbuseReports, SafetyRelationships).asJava)
|
||||
val NUM_REPORTS_AS_SPAMS_DAYS_SINCE_LAST =
|
||||
new Continuous(
|
||||
"realgraph.num_report_as_spams.days_since_last",
|
||||
Set(CountOfAbuseReports, SafetyRelationships).asJava)
|
||||
val NUM_REPORTS_AS_SPAMS_IS_MISSING =
|
||||
new Binary(
|
||||
"realgraph.num_report_as_spams.is_missing",
|
||||
Set(CountOfAbuseReports, SafetyRelationships).asJava)
|
||||
|
||||
val NUM_MUTUAL_FOLLOW_MEAN = new Continuous(
|
||||
"realgraph.num_mutual_follow.mean",
|
||||
Set(
|
||||
Follow,
|
||||
PrivateAccountsFollowedBy,
|
||||
PublicAccountsFollowedBy,
|
||||
PrivateAccountsFollowing,
|
||||
PublicAccountsFollowing).asJava
|
||||
)
|
||||
val NUM_MUTUAL_FOLLOW_EWMA = new Continuous(
|
||||
"realgraph.num_mutual_follow.ewma",
|
||||
Set(
|
||||
Follow,
|
||||
PrivateAccountsFollowedBy,
|
||||
PublicAccountsFollowedBy,
|
||||
PrivateAccountsFollowing,
|
||||
PublicAccountsFollowing).asJava
|
||||
)
|
||||
val NUM_MUTUAL_FOLLOW_VARIANCE = new Continuous(
|
||||
"realgraph.num_mutual_follow.variance",
|
||||
Set(
|
||||
Follow,
|
||||
PrivateAccountsFollowedBy,
|
||||
PublicAccountsFollowedBy,
|
||||
PrivateAccountsFollowing,
|
||||
PublicAccountsFollowing).asJava
|
||||
)
|
||||
val NUM_MUTUAL_FOLLOW_NON_ZERO_DAYS = new Continuous(
|
||||
"realgraph.num_mutual_follow.non_zero_days",
|
||||
Set(
|
||||
Follow,
|
||||
PrivateAccountsFollowedBy,
|
||||
PublicAccountsFollowedBy,
|
||||
PrivateAccountsFollowing,
|
||||
PublicAccountsFollowing).asJava
|
||||
)
|
||||
val NUM_MUTUAL_FOLLOW_ELAPSED_DAYS = new Continuous(
|
||||
"realgraph.num_mutual_follow.elapsed_days",
|
||||
Set(
|
||||
Follow,
|
||||
PrivateAccountsFollowedBy,
|
||||
PublicAccountsFollowedBy,
|
||||
PrivateAccountsFollowing,
|
||||
PublicAccountsFollowing).asJava
|
||||
)
|
||||
val NUM_MUTUAL_FOLLOW_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_mutual_follow.days_since_last",
|
||||
Set(
|
||||
Follow,
|
||||
PrivateAccountsFollowedBy,
|
||||
PublicAccountsFollowedBy,
|
||||
PrivateAccountsFollowing,
|
||||
PublicAccountsFollowing).asJava
|
||||
)
|
||||
val NUM_MUTUAL_FOLLOW_IS_MISSING = new Binary(
|
||||
"realgraph.num_mutual_follow.is_missing",
|
||||
Set(
|
||||
Follow,
|
||||
PrivateAccountsFollowedBy,
|
||||
PublicAccountsFollowedBy,
|
||||
PrivateAccountsFollowing,
|
||||
PublicAccountsFollowing).asJava
|
||||
)
|
||||
|
||||
val NUM_SMS_FOLLOW_MEAN = new Continuous(
|
||||
"realgraph.num_sms_follow.mean",
|
||||
Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava)
|
||||
val NUM_SMS_FOLLOW_EWMA = new Continuous(
|
||||
"realgraph.num_sms_follow.ewma",
|
||||
Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava)
|
||||
val NUM_SMS_FOLLOW_VARIANCE = new Continuous(
|
||||
"realgraph.num_sms_follow.variance",
|
||||
Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava)
|
||||
val NUM_SMS_FOLLOW_NON_ZERO_DAYS = new Continuous(
|
||||
"realgraph.num_sms_follow.non_zero_days",
|
||||
Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava)
|
||||
val NUM_SMS_FOLLOW_ELAPSED_DAYS = new Continuous(
|
||||
"realgraph.num_sms_follow.elapsed_days",
|
||||
Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava)
|
||||
val NUM_SMS_FOLLOW_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_sms_follow.days_since_last",
|
||||
Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava)
|
||||
val NUM_SMS_FOLLOW_IS_MISSING = new Binary(
|
||||
"realgraph.num_sms_follow.is_missing",
|
||||
Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava)
|
||||
|
||||
val NUM_ADDRESS_BOOK_EMAIL_MEAN =
|
||||
new Continuous("realgraph.num_address_book_email.mean", Set(AddressBook).asJava)
|
||||
val NUM_ADDRESS_BOOK_EMAIL_EWMA =
|
||||
new Continuous("realgraph.num_address_book_email.ewma", Set(AddressBook).asJava)
|
||||
val NUM_ADDRESS_BOOK_EMAIL_VARIANCE =
|
||||
new Continuous("realgraph.num_address_book_email.variance", Set(AddressBook).asJava)
|
||||
val NUM_ADDRESS_BOOK_EMAIL_NON_ZERO_DAYS = new Continuous(
|
||||
"realgraph.num_address_book_email.non_zero_days",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_EMAIL_ELAPSED_DAYS = new Continuous(
|
||||
"realgraph.num_address_book_email.elapsed_days",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_EMAIL_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_address_book_email.days_since_last",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_EMAIL_IS_MISSING =
|
||||
new Binary("realgraph.num_address_book_email.is_missing", Set(AddressBook).asJava)
|
||||
|
||||
val NUM_ADDRESS_BOOK_IN_BOTH_MEAN =
|
||||
new Continuous("realgraph.num_address_book_in_both.mean", Set(AddressBook).asJava)
|
||||
val NUM_ADDRESS_BOOK_IN_BOTH_EWMA =
|
||||
new Continuous("realgraph.num_address_book_in_both.ewma", Set(AddressBook).asJava)
|
||||
val NUM_ADDRESS_BOOK_IN_BOTH_VARIANCE = new Continuous(
|
||||
"realgraph.num_address_book_in_both.variance",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_IN_BOTH_NON_ZERO_DAYS = new Continuous(
|
||||
"realgraph.num_address_book_in_both.non_zero_days",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_IN_BOTH_ELAPSED_DAYS = new Continuous(
|
||||
"realgraph.num_address_book_in_both.elapsed_days",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_IN_BOTH_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_address_book_in_both.days_since_last",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_IN_BOTH_IS_MISSING = new Binary(
|
||||
"realgraph.num_address_book_in_both.is_missing",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
|
||||
val NUM_ADDRESS_BOOK_PHONE_MEAN =
|
||||
new Continuous("realgraph.num_address_book_phone.mean", Set(AddressBook).asJava)
|
||||
val NUM_ADDRESS_BOOK_PHONE_EWMA =
|
||||
new Continuous("realgraph.num_address_book_phone.ewma", Set(AddressBook).asJava)
|
||||
val NUM_ADDRESS_BOOK_PHONE_VARIANCE =
|
||||
new Continuous("realgraph.num_address_book_phone.variance", Set(AddressBook).asJava)
|
||||
val NUM_ADDRESS_BOOK_PHONE_NON_ZERO_DAYS = new Continuous(
|
||||
"realgraph.num_address_book_phone.non_zero_days",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_PHONE_ELAPSED_DAYS = new Continuous(
|
||||
"realgraph.num_address_book_phone.elapsed_days",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_PHONE_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_address_book_phone.days_since_last",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_PHONE_IS_MISSING =
|
||||
new Binary("realgraph.num_address_book_phone.is_missing", Set(AddressBook).asJava)
|
||||
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_MEAN =
|
||||
new Continuous("realgraph.num_address_book_mutual_edge_email.mean", Set(AddressBook).asJava)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_EWMA =
|
||||
new Continuous("realgraph.num_address_book_mutual_edge_email.ewma", Set(AddressBook).asJava)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_VARIANCE =
|
||||
new Continuous("realgraph.num_address_book_mutual_edge_email.variance", Set(AddressBook).asJava)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_NON_ZERO_DAYS = new Continuous(
|
||||
"realgraph.num_address_book_mutual_edge_email.non_zero_days",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_ELAPSED_DAYS = new Continuous(
|
||||
"realgraph.num_address_book_mutual_edge_email.elapsed_days",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_address_book_mutual_edge_email.days_since_last",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_IS_MISSING =
|
||||
new Binary("realgraph.num_address_book_mutual_edge_email.is_missing", Set(AddressBook).asJava)
|
||||
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_MEAN =
|
||||
new Continuous("realgraph.num_address_book_mutual_edge_in_both.mean", Set(AddressBook).asJava)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_EWMA =
|
||||
new Continuous("realgraph.num_address_book_mutual_edge_in_both.ewma", Set(AddressBook).asJava)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_VARIANCE = new Continuous(
|
||||
"realgraph.num_address_book_mutual_edge_in_both.variance",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_NON_ZERO_DAYS = new Continuous(
|
||||
"realgraph.num_address_book_mutual_edge_in_both.non_zero_days",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_ELAPSED_DAYS = new Continuous(
|
||||
"realgraph.num_address_book_mutual_edge_in_both.elapsed_days",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_address_book_mutual_edge_in_both.days_since_last",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_IS_MISSING = new Binary(
|
||||
"realgraph.num_address_book_mutual_edge_in_both.is_missing",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_MEAN =
|
||||
new Continuous("realgraph.num_address_book_mutual_edge_phone.mean", Set(AddressBook).asJava)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_EWMA =
|
||||
new Continuous("realgraph.num_address_book_mutual_edge_phone.ewma", Set(AddressBook).asJava)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_VARIANCE =
|
||||
new Continuous("realgraph.num_address_book_mutual_edge_phone.variance", Set(AddressBook).asJava)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_NON_ZERO_DAYS = new Continuous(
|
||||
"realgraph.num_address_book_mutual_edge_phone.non_zero_days",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_ELAPSED_DAYS = new Continuous(
|
||||
"realgraph.num_address_book_mutual_edge_phone.elapsed_days",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_DAYS_SINCE_LAST = new Continuous(
|
||||
"realgraph.num_address_book_mutual_edge_phone.days_since_last",
|
||||
Set(AddressBook).asJava
|
||||
)
|
||||
val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_IS_MISSING =
|
||||
new Binary("realgraph.num_address_book_mutual_edge_phone.is_missing", Set(AddressBook).asJava)
|
||||
}
|
||||
|
||||
/**
 * Bundles one optional real-graph edge feature with the data-record feature
 * definitions that go with each of its statistics.
 *
 * NOTE(review): the meaning of each statistic is taken from the parameter names
 * only; confirm against the code that constructs and consumes this class.
 *
 * @param edgeFeatureOpt       the edge feature, or `None` when it is absent
 * @param meanFeature          continuous feature definition for the mean
 * @param ewmaFeature          continuous feature definition for the EWMA
 * @param varianceFeature      continuous feature definition for the variance
 * @param nonZeroDaysFeature   continuous feature definition for the non-zero-days count
 * @param elapsedDaysFeature   continuous feature definition for the elapsed-days count
 * @param daysSinceLastFeature continuous feature definition for the days-since-last value
 * @param isMissingFeature     binary feature definition for the is-missing flag
 */
case class RealGraphEdgeDataRecordFeatures(
  edgeFeatureOpt: Option[RealGraphEdgeFeature],
  meanFeature: Continuous,
  ewmaFeature: Continuous,
  varianceFeature: Continuous,
  nonZeroDaysFeature: Continuous,
  elapsedDaysFeature: Continuous,
  daysSinceLastFeature: Continuous,
  isMissingFeature: Binary
)
|
|
@ -0,0 +1,9 @@
|
|||
# Scala library target built from every .scala file in this directory.
scala_library(
    sources = ["*.scala"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "src/java/com/twitter/ml/api:api-base",
        "src/thrift/com/twitter/dal/personal_data:personal_data-java",
    ],
)
|
|
@ -0,0 +1,967 @@
|
|||
package com.twitter.timelines.prediction.features.recap
|
||||
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import com.twitter.ml.api.Feature.Binary
|
||||
import com.twitter.ml.api.Feature.Continuous
|
||||
import com.twitter.ml.api.Feature.Discrete
|
||||
import com.twitter.ml.api.Feature.SparseBinary
|
||||
import com.twitter.ml.api.Feature.Text
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Recap feature definitions built with an empty prefix, so every feature name
 * routed through the class's `name` helper is used exactly as written.
 */
object RecapFeatures extends RecapFeatures("")
|
||||
/**
 * The same feature set built with the prefix `in_reply_to_tweet`: every feature
 * name routed through the class's `name` helper becomes
 * `in_reply_to_tweet.<featureName>`.
 *
 * NOTE(review): members of `RecapFeatures` that are declared with a literal name
 * instead of `name(...)` (e.g. `IS_PROFILE_DWELLED`, `IS_FOLLOWED`) are not
 * prefixed and therefore carry the same feature name here as in the unprefixed
 * `RecapFeatures` object. Confirm this is intended.
 */
object InReplyToRecapFeatures extends RecapFeatures("in_reply_to_tweet")
|
||||
|
||||
class RecapFeatures(prefix: String) {
|
||||
/**
 * Qualifies a feature name with this instance's `prefix`.
 *
 * @param featureName the unqualified feature name
 * @return `featureName` unchanged when `prefix` is empty; otherwise `prefix`
 *         and `featureName` joined by a single dot
 */
private def name(featureName: String): String =
  if (prefix.isEmpty) featureName else s"$prefix.$featureName"
|
||||
|
||||
val IS_IPAD_CLIENT = new Binary(name("recap.client.is_ipad"), Set(ClientType).asJava)
|
||||
val IS_WEB_CLIENT = new Binary(name("recap.client.is_web"), Set(ClientType).asJava)
|
||||
val IS_IPHONE_CLIENT = new Binary(name("recap.client.is_phone"), Set(ClientType).asJava)
|
||||
val IS_ANDROID_CLIENT = new Binary(name("recap.client.is_android"), Set(ClientType).asJava)
|
||||
val IS_ANDROID_TABLET_CLIENT =
|
||||
new Binary(name("recap.client.is_android_tablet"), Set(ClientType).asJava)
|
||||
|
||||
// features from userAgent
|
||||
val CLIENT_NAME = new Text(name("recap.user_agent.client_name"), Set(ClientType).asJava)
|
||||
val CLIENT_SOURCE = new Discrete(name("recap.user_agent.client_source"), Set(ClientType).asJava)
|
||||
val CLIENT_VERSION = new Text(name("recap.user_agent.client_version"), Set(ClientVersion).asJava)
|
||||
val CLIENT_VERSION_CODE =
|
||||
new Text(name("recap.user_agent.client_version_code"), Set(ClientVersion).asJava)
|
||||
val DEVICE = new Text(name("recap.user_agent.device"), Set(DeviceType).asJava)
|
||||
val FROM_DOG_FOOD = new Binary(name("recap.meta.from_dog_food"), Set(UserAgent).asJava)
|
||||
val FROM_TWITTER_CLIENT =
|
||||
new Binary(name("recap.user_agent.from_twitter_client"), Set(UserAgent).asJava)
|
||||
val MANUFACTURER = new Text(name("recap.user_agent.manufacturer"), Set(UserAgent).asJava)
|
||||
val MODEL = new Text(name("recap.user_agent.model"), Set(UserAgent).asJava)
|
||||
val NETWORK_CONNECTION =
|
||||
new Discrete(name("recap.user_agent.network_connection"), Set(UserAgent).asJava)
|
||||
val SDK_VERSION = new Text(name("recap.user_agent.sdk_version"), Set(AppId, UserAgent).asJava)
|
||||
|
||||
// engagement
|
||||
val IS_RETWEETED = new Binary(
|
||||
name("recap.engagement.is_retweeted"),
|
||||
Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_FAVORITED = new Binary(
|
||||
name("recap.engagement.is_favorited"),
|
||||
Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_REPLIED = new Binary(
|
||||
name("recap.engagement.is_replied"),
|
||||
Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
// v1: post click engagements: fav, reply
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_V1 = new Binary(
|
||||
name("recap.engagement.is_good_clicked_convo_desc_favorited_or_replied"),
|
||||
Set(
|
||||
PublicLikes,
|
||||
PrivateLikes,
|
||||
PublicReplies,
|
||||
PrivateReplies,
|
||||
EngagementsPrivate,
|
||||
EngagementsPublic).asJava)
|
||||
// v2: post click engagements: click
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_V2 = new Binary(
|
||||
name("recap.engagement.is_good_clicked_convo_desc_v2"),
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_FAVORITED = new Binary(
|
||||
name("recap.engagement.is_good_clicked_convo_desc_favorited"),
|
||||
Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_REPLIED = new Binary(
|
||||
name("recap.engagement.is_good_clicked_convo_desc_replied"),
|
||||
Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_RETWEETED = new Binary(
|
||||
name("recap.engagement.is_good_clicked_convo_desc_retweeted"),
|
||||
Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_CLICKED = new Binary(
|
||||
name("recap.engagement.is_good_clicked_convo_desc_clicked"),
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_FOLLOWED = new Binary(
|
||||
name("recap.engagement.is_good_clicked_convo_desc_followed"),
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_SHARE_DM_CLICKED = new Binary(
|
||||
name("recap.engagement.is_good_clicked_convo_desc_share_dm_clicked"),
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_PROFILE_CLICKED = new Binary(
|
||||
name("recap.engagement.is_good_clicked_convo_desc_profile_clicked"),
|
||||
Set(EngagementsPrivate).asJava)
|
||||
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_0 = new Binary(
|
||||
name("recap.engagement.is_good_clicked_convo_desc_uam_gt_0"),
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_1 = new Binary(
|
||||
name("recap.engagement.is_good_clicked_convo_desc_uam_gt_1"),
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_2 = new Binary(
|
||||
name("recap.engagement.is_good_clicked_convo_desc_uam_gt_2"),
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_3 = new Binary(
|
||||
name("recap.engagement.is_good_clicked_convo_desc_uam_gt_3"),
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
|
||||
val IS_TWEET_DETAIL_DWELLED = new Binary(
|
||||
name("recap.engagement.is_tweet_detail_dwelled"),
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_TWEET_DETAIL_DWELLED_8_SEC = new Binary(
|
||||
name("recap.engagement.is_tweet_detail_dwelled_8_sec"),
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_TWEET_DETAIL_DWELLED_15_SEC = new Binary(
|
||||
name("recap.engagement.is_tweet_detail_dwelled_15_sec"),
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_TWEET_DETAIL_DWELLED_25_SEC = new Binary(
|
||||
name("recap.engagement.is_tweet_detail_dwelled_25_sec"),
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_TWEET_DETAIL_DWELLED_30_SEC = new Binary(
|
||||
name("recap.engagement.is_tweet_detail_dwelled_30_sec"),
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_PROFILE_DWELLED = new Binary(
|
||||
"recap.engagement.is_profile_dwelled",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_DWELLED_10_SEC = new Binary(
|
||||
"recap.engagement.is_profile_dwelled_10_sec",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_DWELLED_20_SEC = new Binary(
|
||||
"recap.engagement.is_profile_dwelled_20_sec",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_DWELLED_30_SEC = new Binary(
|
||||
"recap.engagement.is_profile_dwelled_30_sec",
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_FULLSCREEN_VIDEO_DWELLED = new Binary(
|
||||
"recap.engagement.is_fullscreen_video_dwelled",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_FULLSCREEN_VIDEO_DWELLED_5_SEC = new Binary(
|
||||
"recap.engagement.is_fullscreen_video_dwelled_5_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_FULLSCREEN_VIDEO_DWELLED_10_SEC = new Binary(
|
||||
"recap.engagement.is_fullscreen_video_dwelled_10_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_FULLSCREEN_VIDEO_DWELLED_20_SEC = new Binary(
|
||||
"recap.engagement.is_fullscreen_video_dwelled_20_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_FULLSCREEN_VIDEO_DWELLED_30_SEC = new Binary(
|
||||
"recap.engagement.is_fullscreen_video_dwelled_30_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_LINK_DWELLED_15_SEC = new Binary(
|
||||
"recap.engagement.is_link_dwelled_15_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_LINK_DWELLED_30_SEC = new Binary(
|
||||
"recap.engagement.is_link_dwelled_30_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_LINK_DWELLED_60_SEC = new Binary(
|
||||
"recap.engagement.is_link_dwelled_60_sec",
|
||||
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava)
|
||||
|
||||
val IS_QUOTED = new Binary(
|
||||
name("recap.engagement.is_quoted"),
|
||||
Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_RETWEETED_WITHOUT_QUOTE = new Binary(
|
||||
name("recap.engagement.is_retweeted_without_quote"),
|
||||
Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_CLICKED =
|
||||
new Binary(name("recap.engagement.is_clicked"), Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_DWELLED = new Binary(name("recap.engagement.is_dwelled"), Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_IN_BOUNDS_V1 =
|
||||
new Binary(name("recap.engagement.is_dwelled_in_bounds_v1"), Set(EngagementsPrivate).asJava)
|
||||
val DWELL_NORMALIZED_OVERALL = new Continuous(
|
||||
name("recap.engagement.dwell_normalized_overall"),
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val DWELL_CDF_OVERALL =
|
||||
new Continuous(name("recap.engagement.dwell_cdf_overall"), Set(EngagementsPrivate).asJava)
|
||||
val DWELL_CDF = new Continuous(name("recap.engagement.dwell_cdf"), Set(EngagementsPrivate).asJava)
|
||||
|
||||
val IS_DWELLED_1S =
|
||||
new Binary(name("recap.engagement.is_dwelled_1s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_2S =
|
||||
new Binary(name("recap.engagement.is_dwelled_2s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_3S =
|
||||
new Binary(name("recap.engagement.is_dwelled_3s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_4S =
|
||||
new Binary(name("recap.engagement.is_dwelled_4s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_5S =
|
||||
new Binary(name("recap.engagement.is_dwelled_5s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_6S =
|
||||
new Binary(name("recap.engagement.is_dwelled_6s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_7S =
|
||||
new Binary(name("recap.engagement.is_dwelled_7s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_8S =
|
||||
new Binary(name("recap.engagement.is_dwelled_8s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_9S =
|
||||
new Binary(name("recap.engagement.is_dwelled_9s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_DWELLED_10S =
|
||||
new Binary(name("recap.engagement.is_dwelled_10s"), Set(EngagementsPrivate).asJava)
|
||||
|
||||
val IS_SKIPPED_1S =
|
||||
new Binary(name("recap.engagement.is_skipped_1s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_2S =
|
||||
new Binary(name("recap.engagement.is_skipped_2s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_3S =
|
||||
new Binary(name("recap.engagement.is_skipped_3s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_4S =
|
||||
new Binary(name("recap.engagement.is_skipped_4s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_5S =
|
||||
new Binary(name("recap.engagement.is_skipped_5s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_6S =
|
||||
new Binary(name("recap.engagement.is_skipped_6s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_7S =
|
||||
new Binary(name("recap.engagement.is_skipped_7s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_8S =
|
||||
new Binary(name("recap.engagement.is_skipped_8s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_9S =
|
||||
new Binary(name("recap.engagement.is_skipped_9s"), Set(EngagementsPrivate).asJava)
|
||||
val IS_SKIPPED_10S =
|
||||
new Binary(name("recap.engagement.is_skipped_10s"), Set(EngagementsPrivate).asJava)
|
||||
|
||||
val IS_IMPRESSED =
|
||||
new Binary(name("recap.engagement.is_impressed"), Set(EngagementsPrivate).asJava)
|
||||
val IS_FOLLOWED =
|
||||
new Binary("recap.engagement.is_followed", Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_PROFILE_CLICKED = new Binary(
|
||||
name("recap.engagement.is_profile_clicked"),
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_OPEN_LINKED = new Binary(
|
||||
name("recap.engagement.is_open_linked"),
|
||||
Set(EngagementsPrivate, LinksClickedOn).asJava)
|
||||
val IS_PHOTO_EXPANDED =
|
||||
new Binary(name("recap.engagement.is_photo_expanded"), Set(EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_VIEWED =
|
||||
new Binary(name("recap.engagement.is_video_viewed"), Set(EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_PLAYBACK_START =
|
||||
new Binary(name("recap.engagement.is_video_playback_start"), Set(EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_PLAYBACK_25 =
|
||||
new Binary(name("recap.engagement.is_video_playback_25"), Set(EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_PLAYBACK_50 =
|
||||
new Binary(name("recap.engagement.is_video_playback_50"), Set(EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_PLAYBACK_75 =
|
||||
new Binary(name("recap.engagement.is_video_playback_75"), Set(EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_PLAYBACK_95 =
|
||||
new Binary(name("recap.engagement.is_video_playback_95"), Set(EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_PLAYBACK_COMPLETE =
|
||||
new Binary(name("recap.engagement.is_video_playback_complete"), Set(EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_VIEWED_AND_PLAYBACK_50 = new Binary(
|
||||
name("recap.engagement.is_video_viewed_and_playback_50"),
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_VIDEO_QUALITY_VIEWED = new Binary(
|
||||
name("recap.engagement.is_video_quality_viewed"),
|
||||
Set(EngagementsPrivate).asJava
|
||||
)
|
||||
val IS_TWEET_SHARE_DM_CLICKED =
|
||||
new Binary(name("recap.engagement.is_tweet_share_dm_clicked"), Set(EngagementsPrivate).asJava)
|
||||
val IS_TWEET_SHARE_DM_SENT =
|
||||
new Binary(name("recap.engagement.is_tweet_share_dm_sent"), Set(EngagementsPrivate).asJava)
|
||||
val IS_BOOKMARKED =
|
||||
new Binary(name("recap.engagement.is_bookmarked"), Set(EngagementsPrivate).asJava)
|
||||
val IS_SHARED =
|
||||
new Binary(name("recap.engagement.is_shared"), Set(EngagementsPrivate).asJava)
|
||||
val IS_SHARE_MENU_CLICKED =
|
||||
new Binary(name("recap.engagement.is_share_menu_clicked"), Set(EngagementsPrivate).asJava)
|
||||
|
||||
// Negative engagements
|
||||
val IS_DONT_LIKE =
|
||||
new Binary(name("recap.engagement.is_dont_like"), Set(EngagementsPrivate).asJava)
|
||||
val IS_BLOCK_CLICKED = new Binary(
|
||||
name("recap.engagement.is_block_clicked"),
|
||||
Set(TweetsClicked, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_BLOCK_DIALOG_BLOCKED = new Binary(
|
||||
name("recap.engagement.is_block_dialog_blocked"),
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_MUTE_CLICKED = new Binary(
|
||||
name("recap.engagement.is_mute_clicked"),
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_MUTE_DIALOG_MUTED =
|
||||
new Binary(name("recap.engagement.is_mute_dialog_muted"), Set(EngagementsPrivate).asJava)
|
||||
val IS_REPORT_TWEET_CLICKED = new Binary(
|
||||
name("recap.engagement.is_report_tweet_clicked"),
|
||||
Set(TweetsClicked, EngagementsPrivate).asJava)
|
||||
val IS_NEGATIVE_FEEDBACK =
|
||||
new Binary("recap.engagement.is_negative_feedback", Set(EngagementsPrivate).asJava)
|
||||
val IS_NOT_ABOUT_TOPIC =
|
||||
new Binary(name("recap.engagement.is_not_about_topic"), Set(EngagementsPrivate).asJava)
|
||||
val IS_NOT_RECENT =
|
||||
new Binary(name("recap.engagement.is_not_recent"), Set(EngagementsPrivate).asJava)
|
||||
val IS_NOT_RELEVANT =
|
||||
new Binary(name("recap.engagement.is_not_relevant"), Set(EngagementsPrivate).asJava)
|
||||
val IS_SEE_FEWER =
|
||||
new Binary(name("recap.engagement.is_see_fewer"), Set(EngagementsPrivate).asJava)
|
||||
val IS_TOPIC_SPEC_NEG_ENGAGEMENT =
|
||||
new Binary("recap.engagement.is_topic_spec_neg_engagement", Set(EngagementsPrivate).asJava)
|
||||
val IS_UNFOLLOW_TOPIC =
|
||||
new Binary("recap.engagement.is_unfollow_topic", Set(EngagementsPrivate).asJava)
|
||||
val IS_UNFOLLOW_TOPIC_EXPLICIT_POSITIVE_LABEL =
|
||||
new Binary(
|
||||
"recap.engagement.is_unfollow_topic_explicit_positive_label",
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_UNFOLLOW_TOPIC_IMPLICIT_POSITIVE_LABEL =
|
||||
new Binary(
|
||||
"recap.engagement.is_unfollow_topic_implicit_positive_label",
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_UNFOLLOW_TOPIC_STRONG_EXPLICIT_NEGATIVE_LABEL =
|
||||
new Binary(
|
||||
"recap.engagement.is_unfollow_topic_strong_explicit_negative_label",
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_UNFOLLOW_TOPIC_EXPLICIT_NEGATIVE_LABEL =
|
||||
new Binary(
|
||||
"recap.engagement.is_unfollow_topic_explicit_negative_label",
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_NOT_INTERESTED_IN =
|
||||
new Binary("recap.engagement.is_not_interested_in", Set(EngagementsPrivate).asJava)
|
||||
val IS_NOT_INTERESTED_IN_EXPLICIT_POSITIVE_LABEL =
|
||||
new Binary(
|
||||
"recap.engagement.is_not_interested_in_explicit_positive_label",
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_NOT_INTERESTED_IN_EXPLICIT_NEGATIVE_LABEL =
|
||||
new Binary(
|
||||
"recap.engagement.is_not_interested_in_explicit_negative_label",
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_CARET_CLICKED =
|
||||
new Binary(name("recap.engagement.is_caret_clicked"), Set(EngagementsPrivate).asJava)
|
||||
val IS_FOLLOW_TOPIC =
|
||||
new Binary("recap.engagement.is_follow_topic", Set(EngagementsPrivate).asJava)
|
||||
val IS_NOT_INTERESTED_IN_TOPIC =
|
||||
new Binary("recap.engagement.is_not_interested_in_topic", Set(EngagementsPrivate).asJava)
|
||||
val IS_HOME_LATEST_VISITED =
|
||||
new Binary(name("recap.engagement.is_home_latest_visited"), Set(EngagementsPrivate).asJava)
|
||||
|
||||
// Relevance prompt tweet engagements
|
||||
val IS_RELEVANCE_PROMPT_YES_CLICKED = new Binary(
|
||||
name("recap.engagement.is_relevance_prompt_yes_clicked"),
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_RELEVANCE_PROMPT_NO_CLICKED = new Binary(
|
||||
name("recap.engagement.is_relevance_prompt_no_clicked"),
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_RELEVANCE_PROMPT_IMPRESSED = new Binary(
|
||||
name("recap.engagement.is_relevance_prompt_impressed"),
|
||||
Set(EngagementsPrivate).asJava)
|
||||
|
||||
// Reciprocal engagements for reply forward engagement
|
||||
val IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR = new Binary(
|
||||
name("recap.engagement.is_replied_reply_impressed_by_author"),
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_REPLIED_REPLY_FAVORITED_BY_AUTHOR = new Binary(
|
||||
name("recap.engagement.is_replied_reply_favorited_by_author"),
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateLikes, PublicLikes).asJava)
|
||||
val IS_REPLIED_REPLY_QUOTED_BY_AUTHOR = new Binary(
|
||||
name("recap.engagement.is_replied_reply_quoted_by_author"),
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava)
|
||||
val IS_REPLIED_REPLY_REPLIED_BY_AUTHOR = new Binary(
|
||||
name("recap.engagement.is_replied_reply_replied_by_author"),
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateReplies, PublicReplies).asJava)
|
||||
val IS_REPLIED_REPLY_RETWEETED_BY_AUTHOR = new Binary(
|
||||
name("recap.engagement.is_replied_reply_retweeted_by_author"),
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava)
|
||||
val IS_REPLIED_REPLY_BLOCKED_BY_AUTHOR = new Binary(
|
||||
name("recap.engagement.is_replied_reply_blocked_by_author"),
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_REPLIED_REPLY_FOLLOWED_BY_AUTHOR = new Binary(
|
||||
name("recap.engagement.is_replied_reply_followed_by_author"),
|
||||
Set(EngagementsPrivate, EngagementsPublic, Follow).asJava)
|
||||
val IS_REPLIED_REPLY_UNFOLLOWED_BY_AUTHOR = new Binary(
|
||||
name("recap.engagement.is_replied_reply_unfollowed_by_author"),
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
val IS_REPLIED_REPLY_MUTED_BY_AUTHOR = new Binary(
|
||||
name("recap.engagement.is_replied_reply_muted_by_author"),
|
||||
Set(EngagementsPrivate).asJava)
|
||||
val IS_REPLIED_REPLY_REPORTED_BY_AUTHOR = new Binary(
|
||||
name("recap.engagement.is_replied_reply_reported_by_author"),
|
||||
Set(EngagementsPrivate).asJava)
|
||||
|
||||
// This derived label is the logical OR of REPLY_REPLIED, REPLY_FAVORITED, REPLY_RETWEETED
|
||||
val IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR = new Binary(
|
||||
name("recap.engagement.is_replied_reply_engaged_by_author"),
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
|
||||
// Reciprocal engagements for fav forward engagement
|
||||
val IS_FAVORITED_FAV_FAVORITED_BY_AUTHOR = new Binary(
|
||||
name("recap.engagement.is_favorited_fav_favorited_by_author"),
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateLikes, PublicLikes).asJava
|
||||
)
|
||||
val IS_FAVORITED_FAV_REPLIED_BY_AUTHOR = new Binary(
|
||||
name("recap.engagement.is_favorited_fav_replied_by_author"),
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateReplies, PublicReplies).asJava
|
||||
)
|
||||
val IS_FAVORITED_FAV_RETWEETED_BY_AUTHOR = new Binary(
|
||||
name("recap.engagement.is_favorited_fav_retweeted_by_author"),
|
||||
Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava
|
||||
)
|
||||
// Reciprocal-engagement label (fav forward) for the "followed by author" case.
// Annotated with the Follow personal-data type, mirroring IS_REPLIED_REPLY_FOLLOWED_BY_AUTHOR,
// rather than the retweet data types carried by IS_FAVORITED_FAV_RETWEETED_BY_AUTHOR.
val IS_FAVORITED_FAV_FOLLOWED_BY_AUTHOR = new Binary(
  name("recap.engagement.is_favorited_fav_followed_by_author"),
  Set(EngagementsPrivate, EngagementsPublic, Follow).asJava
)
|
||||
// This derived label is the logical OR of FAV_REPLIED, FAV_FAVORITED, FAV_RETWEETED, FAV_FOLLOWED
|
||||
val IS_FAVORITED_FAV_ENGAGED_BY_AUTHOR = new Binary(
|
||||
name("recap.engagement.is_favorited_fav_engaged_by_author"),
|
||||
Set(EngagementsPrivate, EngagementsPublic).asJava)
|
||||
|
||||
// Define a good profile click as a profile click followed by one of these engagements (follow, fav, reply, retweet, etc.) on the profile page
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_FOLLOW = new Binary(
|
||||
name("recap.engagement.is_profile_clicked_and_profile_follow"),
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, Follow).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_FAV = new Binary(
|
||||
name("recap.engagement.is_profile_clicked_and_profile_fav"),
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateLikes, PublicLikes).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_REPLY = new Binary(
|
||||
name("recap.engagement.is_profile_clicked_and_profile_reply"),
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateReplies, PublicReplies).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_RETWEET = new Binary(
|
||||
name("recap.engagement.is_profile_clicked_and_profile_retweet"),
|
||||
Set(
|
||||
ProfilesViewed,
|
||||
ProfilesClicked,
|
||||
EngagementsPrivate,
|
||||
PrivateRetweets,
|
||||
PublicRetweets).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_CLICK = new Binary(
|
||||
name("recap.engagement.is_profile_clicked_and_profile_tweet_click"),
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, TweetsClicked).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_SHARE_DM_CLICK = new Binary(
|
||||
name("recap.engagement.is_profile_clicked_and_profile_share_dm_click"),
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
// This derived label is the union of all binary features above
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED = new Binary(
|
||||
name("recap.engagement.is_profile_clicked_and_profile_engaged"),
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, EngagementsPublic).asJava)
|
||||
|
||||
// Define a bad profile click as a profile click followed by one of these engagements (user report, tweet report, mute, block, etc.) on the profile page
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_USER_REPORT_CLICK = new Binary(
|
||||
name("recap.engagement.is_profile_clicked_and_profile_user_report_click"),
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_REPORT_CLICK = new Binary(
|
||||
name("recap.engagement.is_profile_clicked_and_profile_tweet_report_click"),
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_MUTE = new Binary(
|
||||
name("recap.engagement.is_profile_clicked_and_profile_mute"),
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_PROFILE_CLICKED_AND_PROFILE_BLOCK = new Binary(
|
||||
name("recap.engagement.is_profile_clicked_and_profile_block"),
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
// This derived label is the union of bad profile click engagements and existing negative feedback
|
||||
val IS_NEGATIVE_FEEDBACK_V2 = new Binary(
|
||||
name("recap.engagement.is_negative_feedback_v2"),
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_STRONG_NEGATIVE_FEEDBACK = new Binary(
|
||||
name("recap.engagement.is_strong_negative_feedback"),
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
val IS_WEAK_NEGATIVE_FEEDBACK = new Binary(
|
||||
name("recap.engagement.is_weak_negative_feedback"),
|
||||
Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava)
|
||||
// engagement for following user from any surface area
|
||||
val IS_FOLLOWED_FROM_ANY_SURFACE_AREA = new Binary(
|
||||
"recap.engagement.is_followed_from_any_surface_area",
|
||||
Set(EngagementsPublic, EngagementsPrivate).asJava)
|
||||
|
||||
// Reply downvote engagements: one label for the downvote and one for its removal.
// Both are Binary features annotated with EngagementsPrivate.
val IS_REPLY_DOWNVOTED = new Binary(
  name("recap.engagement.is_reply_downvoted"),
  Set(EngagementsPrivate).asJava
)

val IS_REPLY_DOWNVOTE_REMOVED = new Binary(
  name("recap.engagement.is_reply_downvote_removed"),
  Set(EngagementsPrivate).asJava
)
|
||||
|
||||
// Other engagements
val IS_GOOD_OPEN_LINK =
  new Binary(
    name("recap.engagement.is_good_open_link"),
    Set(EngagementsPrivate, LinksClickedOn).asJava
  )

// Deprecated - to be removed shortly
val IS_ENGAGED =
  new Binary(
    name("recap.engagement.any"),
    Set(EngagementsPrivate, EngagementsPublic).asJava
  )

// A subset of IS_ENGAGED specifically intended for use in earlybird models
val IS_EARLYBIRD_UNIFIED_ENGAGEMENT =
  new Binary(
    name("recap.engagement.is_unified_engagement"),
    Set(EngagementsPrivate, EngagementsPublic).asJava
  )
|
||||
|
||||
// features from ThriftTweetFeatures
|
||||
// Continuous tweet feature annotated with engagement-score and engagement data types.
// NOTE(review): the feature-name literal is spelled "enagagement" (sic). It is reproduced
// byte-for-byte here; presumably existing data is keyed by this string, so confirm before
// correcting the spelling.
val PREV_USER_TWEET_ENGAGEMENT =
  new Continuous(
    name("recap.tweetfeature.prev_user_tweet_enagagement"),
    Set(EngagementScore, EngagementsPrivate, EngagementsPublic).asJava
  )
|
||||
val IS_SENSITIVE = new Binary(name("recap.tweetfeature.is_sensitive"))
|
||||
val HAS_MULTIPLE_MEDIA = new Binary(
|
||||
name("recap.tweetfeature.has_multiple_media"),
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val IS_AUTHOR_PROFILE_EGG = new Binary(name("recap.tweetfeature.is_author_profile_egg"))
|
||||
val IS_AUTHOR_NEW =
|
||||
new Binary(name("recap.tweetfeature.is_author_new"), Set(UserState, UserType).asJava)
|
||||
val NUM_MENTIONS = new Continuous(
|
||||
name("recap.tweetfeature.num_mentions"),
|
||||
Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_MENTION = new Binary(name("recap.tweetfeature.has_mention"), Set(UserVisibleFlag).asJava)
|
||||
val NUM_HASHTAGS = new Continuous(
|
||||
name("recap.tweetfeature.num_hashtags"),
|
||||
Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_HASHTAG = new Binary(
|
||||
name("recap.tweetfeature.has_hashtag"),
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
// Link-language tweet feature annotated with provided/inferred language data types.
// NOTE(review): declared as Continuous, whereas the LANGUAGE feature in this object is
// Discrete — confirm the value type is intended.
val LINK_LANGUAGE =
  new Continuous(
    name("recap.tweetfeature.link_language"),
    Set(ProvidedLanguage, InferredLanguage).asJava
  )
|
||||
val IS_AUTHOR_NSFW =
|
||||
new Binary(name("recap.tweetfeature.is_author_nsfw"), Set(UserSafetyLabels, UserType).asJava)
|
||||
val IS_AUTHOR_SPAM =
|
||||
new Binary(name("recap.tweetfeature.is_author_spam"), Set(UserSafetyLabels, UserType).asJava)
|
||||
val IS_AUTHOR_BOT =
|
||||
new Binary(name("recap.tweetfeature.is_author_bot"), Set(UserSafetyLabels, UserType).asJava)
|
||||
val SIGNATURE =
|
||||
new Discrete(name("recap.tweetfeature.signature"), Set(DigitalSignatureNonrepudiation).asJava)
|
||||
val LANGUAGE = new Discrete(
|
||||
name("recap.tweetfeature.language"),
|
||||
Set(ProvidedLanguage, InferredLanguage).asJava)
|
||||
val FROM_INACTIVE_USER =
|
||||
new Binary(name("recap.tweetfeature.from_inactive_user"), Set(UserActiveFlag).asJava)
|
||||
val PROBABLY_FROM_FOLLOWED_AUTHOR = new Binary(name("recap.v3.tweetfeature.probably_from_follow"))
|
||||
val FROM_MUTUAL_FOLLOW = new Binary(name("recap.tweetfeature.from_mutual_follow"))
|
||||
val USER_REP = new Continuous(name("recap.tweetfeature.user_rep"))
|
||||
val FROM_VERIFIED_ACCOUNT =
|
||||
new Binary(name("recap.tweetfeature.from_verified_account"), Set(UserVerifiedFlag).asJava)
|
||||
val IS_BUSINESS_SCORE = new Continuous(name("recap.tweetfeature.is_business_score"))
|
||||
val HAS_CONSUMER_VIDEO = new Binary(
|
||||
name("recap.tweetfeature.has_consumer_video"),
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_PRO_VIDEO = new Binary(
|
||||
name("recap.tweetfeature.has_pro_video"),
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_VINE = new Binary(
|
||||
name("recap.tweetfeature.has_vine"),
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_PERISCOPE = new Binary(
|
||||
name("recap.tweetfeature.has_periscope"),
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_NATIVE_VIDEO = new Binary(
|
||||
name("recap.tweetfeature.has_native_video"),
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_NATIVE_IMAGE = new Binary(
|
||||
name("recap.tweetfeature.has_native_image"),
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_CARD = new Binary(
|
||||
name("recap.tweetfeature.has_card"),
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_IMAGE = new Binary(
|
||||
name("recap.tweetfeature.has_image"),
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_NEWS = new Binary(
|
||||
name("recap.tweetfeature.has_news"),
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_VIDEO = new Binary(
|
||||
name("recap.tweetfeature.has_video"),
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_VISIBLE_LINK = new Binary(
|
||||
name("recap.tweetfeature.has_visible_link"),
|
||||
Set(UrlFoundFlag, PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val LINK_COUNT = new Continuous(
|
||||
name("recap.tweetfeature.link_count"),
|
||||
Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_LINK = new Binary(
|
||||
name("recap.tweetfeature.has_link"),
|
||||
Set(UrlFoundFlag, PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val IS_OFFENSIVE = new Binary(name("recap.tweetfeature.is_offensive"))
|
||||
val HAS_TREND = new Binary(
|
||||
name("recap.tweetfeature.has_trend"),
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val HAS_MULTIPLE_HASHTAGS_OR_TRENDS = new Binary(
|
||||
name("recap.tweetfeature.has_multiple_hashtag_or_trend"),
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val URL_DOMAINS = new SparseBinary(
|
||||
name("recap.tweetfeature.url_domains"),
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val CONTAINS_MEDIA = new Binary(
|
||||
name("recap.tweetfeature.contains_media"),
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val RETWEET_SEARCHER = new Binary(name("recap.tweetfeature.retweet_searcher"))
|
||||
val REPLY_SEARCHER = new Binary(name("recap.tweetfeature.reply_searcher"))
|
||||
val MENTION_SEARCHER =
|
||||
new Binary(name("recap.tweetfeature.mention_searcher"), Set(UserVisibleFlag).asJava)
|
||||
val REPLY_OTHER =
|
||||
new Binary(name("recap.tweetfeature.reply_other"), Set(PublicReplies, PrivateReplies).asJava)
|
||||
val RETWEET_OTHER = new Binary(
|
||||
name("recap.tweetfeature.retweet_other"),
|
||||
Set(PublicRetweets, PrivateRetweets).asJava)
|
||||
val IS_REPLY =
|
||||
new Binary(name("recap.tweetfeature.is_reply"), Set(PublicReplies, PrivateReplies).asJava)
|
||||
val IS_RETWEET =
|
||||
new Binary(name("recap.tweetfeature.is_retweet"), Set(PublicRetweets, PrivateRetweets).asJava)
|
||||
val IS_EXTENDED_REPLY = new Binary(
|
||||
name("recap.tweetfeature.is_extended_reply"),
|
||||
Set(PublicReplies, PrivateReplies).asJava)
|
||||
val MATCH_UI_LANG = new Binary(
|
||||
name("recap.tweetfeature.match_ui_lang"),
|
||||
Set(ProvidedLanguage, InferredLanguage).asJava)
|
||||
val MATCH_SEARCHER_MAIN_LANG = new Binary(
|
||||
name("recap.tweetfeature.match_searcher_main_lang"),
|
||||
Set(ProvidedLanguage, InferredLanguage).asJava)
|
||||
val MATCH_SEARCHER_LANGS = new Binary(
|
||||
name("recap.tweetfeature.match_searcher_langs"),
|
||||
Set(ProvidedLanguage, InferredLanguage).asJava)
|
||||
val BIDIRECTIONAL_REPLY_COUNT = new Continuous(
|
||||
name("recap.tweetfeature.bidirectional_reply_count"),
|
||||
Set(CountOfPrivateReplies, CountOfPublicReplies).asJava)
|
||||
val UNIDIRECTIONAL_REPLY_COUNT = new Continuous(
|
||||
name("recap.tweetfeature.unidirectional_reply_count"),
|
||||
Set(CountOfPrivateReplies, CountOfPublicReplies).asJava)
|
||||
val BIDIRECTIONAL_RETWEET_COUNT = new Continuous(
|
||||
name("recap.tweetfeature.bidirectional_retweet_count"),
|
||||
Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava)
|
||||
val UNIDIRECTIONAL_RETWEET_COUNT = new Continuous(
|
||||
name("recap.tweetfeature.unidirectional_retweet_count"),
|
||||
Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava)
|
||||
val BIDIRECTIONAL_FAV_COUNT = new Continuous(
|
||||
name("recap.tweetfeature.bidirectional_fav_count"),
|
||||
Set(CountOfPrivateLikes, CountOfPublicLikes).asJava)
|
||||
// Continuous fav-count feature annotated with the private/public like-count data types.
// NOTE(review): the feature-name literal is spelled "unidirectiona_fav_count" (missing "l"),
// unlike the sibling "unidirectional_*" names. It is reproduced byte-for-byte here;
// presumably existing data is keyed by this string, so confirm before correcting it.
val UNIDIRECTIONAL_FAV_COUNT =
  new Continuous(
    name("recap.tweetfeature.unidirectiona_fav_count"),
    Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
  )
|
||||
val CONVERSATIONAL_COUNT = new Continuous(
|
||||
name("recap.tweetfeature.conversational_count"),
|
||||
Set(CountOfPrivateTweets, CountOfPublicTweets).asJava)
|
||||
// tweet impressions on an embedded tweet
|
||||
val EMBEDS_IMPRESSION_COUNT = new Continuous(
|
||||
name("recap.tweetfeature.embeds_impression_count"),
|
||||
Set(CountOfImpression).asJava)
|
||||
// number of URLs that embed the tweet
|
||||
val EMBEDS_URL_COUNT = new Continuous(
|
||||
name("recap.tweetfeature.embeds_url_count"),
|
||||
Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava)
|
||||
// currently only counts views on Snappy and Amplify pro videos. Counts for other videos forthcoming
|
||||
val VIDEO_VIEW_COUNT = new Continuous(
|
||||
name("recap.tweetfeature.video_view_count"),
|
||||
Set(
|
||||
CountOfTweetEntitiesClicked,
|
||||
CountOfPrivateTweetEntitiesAndMetadata,
|
||||
CountOfPublicTweetEntitiesAndMetadata,
|
||||
EngagementsPrivate,
|
||||
EngagementsPublic).asJava
|
||||
)
|
||||
val TWEET_COUNT_FROM_USER_IN_SNAPSHOT = new Continuous(
|
||||
name("recap.tweetfeature.tweet_count_from_user_in_snapshot"),
|
||||
Set(CountOfPrivateTweets, CountOfPublicTweets).asJava)
|
||||
val NORMALIZED_PARUS_SCORE =
|
||||
new Continuous("recap.tweetfeature.normalized_parus_score", Set(EngagementScore).asJava)
|
||||
val PARUS_SCORE = new Continuous("recap.tweetfeature.parus_score", Set(EngagementScore).asJava)
|
||||
val REAL_GRAPH_WEIGHT =
|
||||
new Continuous("recap.tweetfeature.real_graph_weight", Set(UsersRealGraphScore).asJava)
|
||||
val SARUS_GRAPH_WEIGHT = new Continuous("recap.tweetfeature.sarus_graph_weight")
|
||||
// Continuous topic-similarity feature; declared without personal-data-type annotations and
// with a raw name string (not wrapped in name(...)).
// NOTE(review): the identifier is spelled "INTERSTED" while the feature-name string says
// "interested". The identifier is kept as-is because other code may reference it.
val TOPIC_SIM_SEARCHER_INTERSTED_IN_AUTHOR_KNOWN_FOR =
  new Continuous("recap.tweetfeature.topic_sim_searcher_interested_in_author_known_for")
|
||||
val TOPIC_SIM_SEARCHER_AUTHOR_BOTH_INTERESTED_IN = new Continuous(
|
||||
"recap.tweetfeature.topic_sim_searcher_author_both_interested_in")
|
||||
val TOPIC_SIM_SEARCHER_AUTHOR_BOTH_KNOWN_FOR = new Continuous(
|
||||
"recap.tweetfeature.topic_sim_searcher_author_both_known_for")
|
||||
val TOPIC_SIM_SEARCHER_INTERESTED_IN_TWEET = new Continuous(
|
||||
"recap.tweetfeature.topic_sim_searcher_interested_in_tweet")
|
||||
val IS_RETWEETER_PROFILE_EGG =
|
||||
new Binary(name("recap.v2.tweetfeature.is_retweeter_profile_egg"), Set(UserType).asJava)
|
||||
val IS_RETWEETER_NEW =
|
||||
new Binary(name("recap.v2.tweetfeature.is_retweeter_new"), Set(UserType, UserState).asJava)
|
||||
val IS_RETWEETER_BOT =
|
||||
new Binary(
|
||||
name("recap.v2.tweetfeature.is_retweeter_bot"),
|
||||
Set(UserType, UserSafetyLabels).asJava)
|
||||
val IS_RETWEETER_NSFW =
|
||||
new Binary(
|
||||
name("recap.v2.tweetfeature.is_retweeter_nsfw"),
|
||||
Set(UserType, UserSafetyLabels).asJava)
|
||||
val IS_RETWEETER_SPAM =
|
||||
new Binary(
|
||||
name("recap.v2.tweetfeature.is_retweeter_spam"),
|
||||
Set(UserType, UserSafetyLabels).asJava)
|
||||
val RETWEET_OF_MUTUAL_FOLLOW = new Binary(
|
||||
name("recap.v2.tweetfeature.retweet_of_mutual_follow"),
|
||||
Set(PublicRetweets, PrivateRetweets).asJava)
|
||||
val SOURCE_AUTHOR_REP = new Continuous(name("recap.v2.tweetfeature.source_author_rep"))
|
||||
val IS_RETWEET_OF_REPLY = new Binary(
|
||||
name("recap.v2.tweetfeature.is_retweet_of_reply"),
|
||||
Set(PublicRetweets, PrivateRetweets).asJava)
|
||||
val RETWEET_DIRECTED_AT_USER_IN_FIRST_DEGREE = new Binary(
|
||||
name("recap.v2.tweetfeature.is_retweet_directed_at_user_in_first_degree"),
|
||||
Set(PublicRetweets, PrivateRetweets, Follow).asJava)
|
||||
val MENTIONED_SCREEN_NAMES = new SparseBinary(
|
||||
"entities.users.mentioned_screen_names",
|
||||
Set(DisplayName, UserVisibleFlag).asJava)
|
||||
val MENTIONED_SCREEN_NAME = new Text(
|
||||
"entities.users.mentioned_screen_names.member",
|
||||
Set(DisplayName, UserVisibleFlag).asJava)
|
||||
val HASHTAGS = new SparseBinary(
|
||||
"entities.hashtags",
|
||||
Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava)
|
||||
val URL_SLUGS = new SparseBinary(name("recap.linkfeature.url_slugs"), Set(UrlFoundFlag).asJava)
|
||||
|
||||
// features from ThriftSearchResultMetadata
// The three count features carry the matching private/public count data types;
// the two score features are declared without annotations.
val REPLY_COUNT =
  new Continuous(
    name("recap.searchfeature.reply_count"),
    Set(CountOfPrivateReplies, CountOfPublicReplies).asJava
  )

val RETWEET_COUNT =
  new Continuous(
    name("recap.searchfeature.retweet_count"),
    Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
  )

val FAV_COUNT =
  new Continuous(
    name("recap.searchfeature.fav_count"),
    Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
  )

val BLENDER_SCORE =
  new Continuous(name("recap.searchfeature.blender_score"))

val TEXT_SCORE =
  new Continuous(name("recap.searchfeature.text_score"))
|
||||
|
||||
// features related to content source
|
||||
val SOURCE_TYPE = new Discrete(name("recap.source.type"))
|
||||
|
||||
// features from addressbook
|
||||
// the author is in the user's email addressbook
|
||||
val USER_TO_AUTHOR_EMAIL_REACHABLE =
|
||||
new Binary(name("recap.addressbook.user_to_author_email_reachable"), Set(AddressBook).asJava)
|
||||
// the author is in the user's phone addressbook
|
||||
val USER_TO_AUTHOR_PHONE_REACHABLE =
|
||||
new Binary(name("recap.addressbook.user_to_author_phone_reachable"), Set(AddressBook).asJava)
|
||||
// the user is in the author's email addressbook
|
||||
val AUTHOR_TO_USER_EMAIL_REACHABLE =
|
||||
new Binary(name("recap.addressbook.author_to_user_email_reachable"), Set(AddressBook).asJava)
|
||||
// the user is in the author's phone addressbook
|
||||
val AUTHOR_TO_USER_PHONE_REACHABLE =
|
||||
new Binary(name("recap.addressbook.author_to_user_phone_reachable"), Set(AddressBook).asJava)
|
||||
|
||||
// predicted engagement (these features are used by prediction service to return the predicted engagement probability)
|
||||
// these should match the names in engagement_to_score_feature_mapping
|
||||
val PREDICTED_IS_FAVORITED =
|
||||
new Continuous(name("recap.engagement_predicted.is_favorited"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_RETWEETED =
|
||||
new Continuous(name("recap.engagement_predicted.is_retweeted"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_QUOTED =
|
||||
new Continuous(name("recap.engagement_predicted.is_quoted"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_REPLIED =
|
||||
new Continuous(name("recap.engagement_predicted.is_replied"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_GOOD_OPEN_LINK = new Continuous(
|
||||
name("recap.engagement_predicted.is_good_open_link"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_PROFILE_CLICKED = new Continuous(
|
||||
name("recap.engagement_predicted.is_profile_clicked"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED = new Continuous(
|
||||
name("recap.engagement_predicted.is_profile_clicked_and_profile_engaged"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_CLICKED =
|
||||
new Continuous(name("recap.engagement_predicted.is_clicked"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_PHOTO_EXPANDED = new Continuous(
|
||||
name("recap.engagement_predicted.is_photo_expanded"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_DONT_LIKE =
|
||||
new Continuous(name("recap.engagement_predicted.is_dont_like"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_VIDEO_PLAYBACK_50 = new Continuous(
|
||||
name("recap.engagement_predicted.is_video_playback_50"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_VIDEO_QUALITY_VIEWED = new Continuous(
|
||||
name("recap.engagement_predicted.is_video_quality_viewed"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_BOOKMARKED =
|
||||
new Continuous(name("recap.engagement_predicted.is_bookmarked"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_SHARED =
|
||||
new Continuous(name("recap.engagement_predicted.is_shared"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_SHARE_MENU_CLICKED =
|
||||
new Continuous(
|
||||
name("recap.engagement_predicted.is_share_menu_clicked"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_PROFILE_DWELLED_20_SEC = new Continuous(
|
||||
name("recap.engagement_predicted.is_profile_dwelled_20_sec"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_5_SEC = new Continuous(
|
||||
name("recap.engagement_predicted.is_fullscreen_video_dwelled_5_sec"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_10_SEC = new Continuous(
|
||||
name("recap.engagement_predicted.is_fullscreen_video_dwelled_10_sec"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_20_SEC = new Continuous(
|
||||
name("recap.engagement_predicted.is_fullscreen_video_dwelled_20_sec"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_30_SEC = new Continuous(
|
||||
name("recap.engagement_predicted.is_fullscreen_video_dwelled_30_sec"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_UNIFIED_ENGAGEMENT = new Continuous(
|
||||
name("recap.engagement_predicted.is_unified_engagement"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_COMPOSE_TRIGGERED = new Continuous(
|
||||
name("recap.engagement_predicted.is_compose_triggered"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR = new Continuous(
|
||||
name("recap.engagement_predicted.is_replied_reply_impressed_by_author"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR = new Continuous(
|
||||
name("recap.engagement_predicted.is_replied_reply_engaged_by_author"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_GOOD_CLICKED_V1 = new Continuous(
|
||||
name("recap.engagement_predicted.is_good_clicked_convo_desc_favorited_or_replied"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_GOOD_CLICKED_V2 = new Continuous(
|
||||
name("recap.engagement_predicted.is_good_clicked_convo_desc_v2"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_TWEET_DETAIL_DWELLED_8_SEC = new Continuous(
|
||||
name("recap.engagement_predicted.is_tweet_detail_dwelled_8_sec"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_TWEET_DETAIL_DWELLED_15_SEC = new Continuous(
|
||||
name("recap.engagement_predicted.is_tweet_detail_dwelled_15_sec"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_TWEET_DETAIL_DWELLED_25_SEC = new Continuous(
|
||||
name("recap.engagement_predicted.is_tweet_detail_dwelled_25_sec"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_TWEET_DETAIL_DWELLED_30_SEC = new Continuous(
|
||||
name("recap.engagement_predicted.is_tweet_detail_dwelled_30_sec"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_FAVORITED_FAV_ENGAGED_BY_AUTHOR = new Continuous(
|
||||
name("recap.engagement_predicted.is_favorited_fav_engaged_by_author"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_GOOD_CLICKED_WITH_DWELL_SUM_GTE_60S = new Continuous(
|
||||
name(
|
||||
"recap.engagement_predicted.is_good_clicked_convo_desc_favorited_or_replied_or_dwell_sum_gte_60_secs"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_DWELLED_IN_BOUNDS_V1 = new Continuous(
|
||||
name("recap.engagement_predicted.is_dwelled_in_bounds_v1"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_DWELL_NORMALIZED_OVERALL = new Continuous(
|
||||
name("recap.engagement_predicted.dwell_normalized_overall"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_DWELL_CDF =
|
||||
new Continuous(name("recap.engagement_predicted.dwell_cdf"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_DWELL_CDF_OVERALL = new Continuous(
|
||||
name("recap.engagement_predicted.dwell_cdf_overall"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_DWELLED =
|
||||
new Continuous(name("recap.engagement_predicted.is_dwelled"), Set(EngagementScore).asJava)
|
||||
|
||||
val PREDICTED_IS_DWELLED_1S =
|
||||
new Continuous(name("recap.engagement_predicted.is_dwelled_1s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_DWELLED_2S =
|
||||
new Continuous(name("recap.engagement_predicted.is_dwelled_2s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_DWELLED_3S =
|
||||
new Continuous(name("recap.engagement_predicted.is_dwelled_3s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_DWELLED_4S =
|
||||
new Continuous(name("recap.engagement_predicted.is_dwelled_4s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_DWELLED_5S =
|
||||
new Continuous(name("recap.engagement_predicted.is_dwelled_5s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_DWELLED_6S =
|
||||
new Continuous(name("recap.engagement_predicted.is_dwelled_6s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_DWELLED_7S =
|
||||
new Continuous(name("recap.engagement_predicted.is_dwelled_7s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_DWELLED_8S =
|
||||
new Continuous(name("recap.engagement_predicted.is_dwelled_8s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_DWELLED_9S =
|
||||
new Continuous(name("recap.engagement_predicted.is_dwelled_9s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_DWELLED_10S =
|
||||
new Continuous(name("recap.engagement_predicted.is_dwelled_10s"), Set(EngagementScore).asJava)
|
||||
|
||||
val PREDICTED_IS_SKIPPED_1S =
|
||||
new Continuous(name("recap.engagement_predicted.is_skipped_1s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_SKIPPED_2S =
|
||||
new Continuous(name("recap.engagement_predicted.is_skipped_2s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_SKIPPED_3S =
|
||||
new Continuous(name("recap.engagement_predicted.is_skipped_3s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_SKIPPED_4S =
|
||||
new Continuous(name("recap.engagement_predicted.is_skipped_4s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_SKIPPED_5S =
|
||||
new Continuous(name("recap.engagement_predicted.is_skipped_5s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_SKIPPED_6S =
|
||||
new Continuous(name("recap.engagement_predicted.is_skipped_6s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_SKIPPED_7S =
|
||||
new Continuous(name("recap.engagement_predicted.is_skipped_7s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_SKIPPED_8S =
|
||||
new Continuous(name("recap.engagement_predicted.is_skipped_8s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_SKIPPED_9S =
|
||||
new Continuous(name("recap.engagement_predicted.is_skipped_9s"), Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_SKIPPED_10S =
|
||||
new Continuous(name("recap.engagement_predicted.is_skipped_10s"), Set(EngagementScore).asJava)
|
||||
|
||||
val PREDICTED_IS_HOME_LATEST_VISITED = new Continuous(
|
||||
name("recap.engagement_predicted.is_home_latest_visited"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_NEGATIVE_FEEDBACK =
|
||||
new Continuous(
|
||||
name("recap.engagement_predicted.is_negative_feedback"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_NEGATIVE_FEEDBACK_V2 =
|
||||
new Continuous(
|
||||
name("recap.engagement_predicted.is_negative_feedback_v2"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_WEAK_NEGATIVE_FEEDBACK =
|
||||
new Continuous(
|
||||
name("recap.engagement_predicted.is_weak_negative_feedback"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_STRONG_NEGATIVE_FEEDBACK =
|
||||
new Continuous(
|
||||
name("recap.engagement_predicted.is_strong_negative_feedback"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_REPORT_TWEET_CLICKED =
|
||||
new Continuous(
|
||||
name("recap.engagement_predicted.is_report_tweet_clicked"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_UNFOLLOW_TOPIC =
|
||||
new Continuous(
|
||||
name("recap.engagement_predicted.is_unfollow_topic"),
|
||||
Set(EngagementScore).asJava)
|
||||
val PREDICTED_IS_RELEVANCE_PROMPT_YES_CLICKED = new Continuous(
|
||||
name("recap.engagement_predicted.is_relevance_prompt_yes_clicked"),
|
||||
Set(EngagementScore).asJava)
|
||||
|
||||
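// Predicted engagement score for following a user from any surface area.
// NOTE(review): unlike the neighbouring PREDICTED_* features, this feature name is passed as a raw
// string rather than through name(...) - confirm that this is intentional.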
|
||||
val PREDICTED_IS_FOLLOWED_FROM_ANY_SURFACE_AREA = new Continuous(
|
||||
"recap.engagement_predicted.is_followed_from_any_surface_area",
|
||||
Set(EngagementScore).asJava)
|
||||
|
||||
|
||||
// These are global engagement counts for the Tweets.
|
||||
val FAV_COUNT_V2 = new Continuous(
|
||||
name("recap.earlybird.fav_count_v2"),
|
||||
Set(CountOfPrivateLikes, CountOfPublicLikes).asJava)
|
||||
val RETWEET_COUNT_V2 = new Continuous(
|
||||
name("recap.earlybird.retweet_count_v2"),
|
||||
Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava)
|
||||
val REPLY_COUNT_V2 = new Continuous(
|
||||
name("recap.earlybird.reply_count_v2"),
|
||||
Set(CountOfPrivateReplies, CountOfPublicReplies).asJava)
|
||||
|
||||
val HAS_US_POLITICAL_ANNOTATION = new Binary(
|
||||
name("recap.has_us_political_annotation"),
|
||||
Set(SemanticcoreClassification).asJava
|
||||
)
|
||||
|
||||
val HAS_US_POLITICAL_ALL_GROUPS_ANNOTATION = new Binary(
|
||||
name("recap.has_us_political_all_groups_annotation"),
|
||||
Set(SemanticcoreClassification).asJava
|
||||
)
|
||||
|
||||
val HAS_US_POLITICAL_ANNOTATION_HIGH_RECALL = new Binary(
|
||||
name("recap.has_us_political_annotation_high_recall"),
|
||||
Set(SemanticcoreClassification).asJava
|
||||
)
|
||||
|
||||
val HAS_US_POLITICAL_ANNOTATION_HIGH_RECALL_V2 = new Binary(
|
||||
name("recap.has_us_political_annotation_high_recall_v2"),
|
||||
Set(SemanticcoreClassification).asJava
|
||||
)
|
||||
|
||||
val HAS_US_POLITICAL_ANNOTATION_HIGH_PRECISION_V0 = new Binary(
|
||||
name("recap.has_us_political_annotation_high_precision_v0"),
|
||||
Set(SemanticcoreClassification).asJava
|
||||
)
|
||||
|
||||
val HAS_US_POLITICAL_ANNOTATION_BALANCED_PRECISION_RECALL_V0 = new Binary(
|
||||
name("recap.has_us_political_annotation_balanced_precision_recall_v0"),
|
||||
Set(SemanticcoreClassification).asJava
|
||||
)
|
||||
|
||||
val HAS_US_POLITICAL_ANNOTATION_HIGH_RECALL_V3 = new Binary(
|
||||
name("recap.has_us_political_annotation_high_recall_v3"),
|
||||
Set(SemanticcoreClassification).asJava
|
||||
)
|
||||
|
||||
val HAS_US_POLITICAL_ANNOTATION_HIGH_PRECISION_V3 = new Binary(
|
||||
name("recap.has_us_political_annotation_high_precision_v3"),
|
||||
Set(SemanticcoreClassification).asJava
|
||||
)
|
||||
|
||||
val HAS_US_POLITICAL_ANNOTATION_BALANCED_V3 = new Binary(
|
||||
name("recap.has_us_political_annotation_balanced_v3"),
|
||||
Set(SemanticcoreClassification).asJava
|
||||
)
|
||||
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
package com.twitter.timelines.prediction.features.recap
|
||||
|
||||
/**
 * Lookup tables that pair each engagement label feature in [[RecapFeatures]] with the feature
 * that carries the predicted score for the same engagement.
 *
 * All tables are derived from a single list of (label, score) pairs so that the name-keyed and
 * id-keyed views can never drift apart.
 */
object RecapFeaturesUtils {
  private type AnyFeature = com.twitter.ml.api.Feature[_]

  // Single source of truth: (label feature, predicted-score feature).
  // This needs to be updated if an engagement model is added or removed from prediction service.
  private val labelAndScoreFeatures: Seq[(AnyFeature, AnyFeature)] = Seq(
    (RecapFeatures.IS_FAVORITED, RecapFeatures.PREDICTED_IS_FAVORITED),
    (RecapFeatures.IS_REPLIED, RecapFeatures.PREDICTED_IS_REPLIED),
    (RecapFeatures.IS_RETWEETED, RecapFeatures.PREDICTED_IS_RETWEETED),
    (RecapFeatures.IS_GOOD_CLICKED_CONVO_DESC_V1, RecapFeatures.PREDICTED_IS_GOOD_CLICKED_V1),
    (RecapFeatures.IS_GOOD_CLICKED_CONVO_DESC_V2, RecapFeatures.PREDICTED_IS_GOOD_CLICKED_V2),
    // Currently disabled:
    // (RecapFeatures.IS_NEGATIVE_FEEDBACK_V2, RecapFeatures.PREDICTED_IS_NEGATIVE_FEEDBACK_V2),
    (
      RecapFeatures.IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED,
      RecapFeatures.PREDICTED_IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED),
    (
      RecapFeatures.IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR,
      RecapFeatures.PREDICTED_IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR)
  )

  /** Label feature name -> id of the corresponding predicted-score feature. */
  val scoreFeatureIdsMap: Map[String, Long] =
    labelAndScoreFeatures.map {
      case (label, score) => label.getFeatureName -> score.getFeatureId
    }.toMap

  /** Label feature id -> id of the corresponding predicted-score feature. */
  val labelFeatureIdToScoreFeatureIdsMap: Map[Long, Long] =
    labelAndScoreFeatures.map {
      case (label, score) => label.getFeatureId -> score.getFeatureId
    }.toMap

  /** Names of all label features that have a predicted-score counterpart. */
  val labelFeatureNames: Seq[String] = scoreFeatureIdsMap.keys.toSeq
}
|
|
@ -0,0 +1,9 @@
|
|||
# Scala library target whose sources are matched by the "*.scala" glob.
# It declares dependencies on the ML API base target and the personal-data thrift (Java) target.
scala_library(
|
||||
sources = ["*.scala"],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"src/java/com/twitter/ml/api:api-base",
|
||||
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,57 @@
|
|||
package com.twitter.timelines.prediction.features.request_context
|
||||
|
||||
import com.twitter.ml.api.FeatureContext
|
||||
import com.twitter.ml.api.Feature._
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Feature definitions for the `request_context.*` namespace, together with a
 * [[FeatureContext]] that contains all of them.
 */
object RequestContextFeatures {
  val COUNTRY_CODE: Text =
    new Text("request_context.country_code", Set(PrivateCountryOrRegion, InferredCountry).asJava)

  val LANGUAGE_CODE: Text =
    new Text(
      "request_context.language_code",
      Set(GeneralSettings, ProvidedLanguage, InferredLanguage).asJava
    )

  val REQUEST_PROVENANCE: Text =
    new Text("request_context.request_provenance", Set(AppUsage).asJava)

  val DISPLAY_WIDTH: Continuous =
    new Continuous("request_context.display_width", Set(OtherDeviceInfo).asJava)

  val DISPLAY_HEIGHT: Continuous =
    new Continuous("request_context.display_height", Set(OtherDeviceInfo).asJava)

  val DISPLAY_DPI: Continuous =
    new Continuous("request_context.display_dpi", Set(OtherDeviceInfo).asJava)

  // Hour and day-of-week are deliberately Discrete rather than Continuous: a continuous
  // encoding cannot express that e.g. hour 23 is adjacent to hour 0. Each hour/day slice is
  // instead treated independently, like a set of sparse binary features.
  val TIMESTAMP_GMT_HOUR: Discrete =
    new Discrete("request_context.timestamp_gmt_hour", Set(PrivateTimestamp).asJava)

  val TIMESTAMP_GMT_DOW: Discrete =
    new Discrete("request_context.timestamp_gmt_dow", Set(PrivateTimestamp).asJava)

  val IS_GET_INITIAL: Binary = new Binary("request_context.is_get_initial")
  val IS_GET_MIDDLE: Binary = new Binary("request_context.is_get_middle")
  val IS_GET_NEWER: Binary = new Binary("request_context.is_get_newer")
  val IS_GET_OLDER: Binary = new Binary("request_context.is_get_older")

  // These two are Discrete rather than Binary because the source field is Option[Boolean]
  // and Some(false) must stay distinguishable from None. None will be converted to -1.
  val IS_POLLING: Discrete = new Discrete("request_context.is_polling")
  val IS_SESSION_START: Discrete = new Discrete("request_context.is_session_start")

  // Helps distinguish requests from "home" vs "home_latest" (reverse chron home view).
  val TIMELINE_KIND: Text = new Text("request_context.timeline_kind")

  /** Feature context containing every feature declared in this object. */
  val featureContext: FeatureContext =
    new FeatureContext(
      COUNTRY_CODE,
      LANGUAGE_CODE,
      REQUEST_PROVENANCE,
      DISPLAY_WIDTH,
      DISPLAY_HEIGHT,
      DISPLAY_DPI,
      TIMESTAMP_GMT_HOUR,
      TIMESTAMP_GMT_DOW,
      IS_GET_INITIAL,
      IS_GET_MIDDLE,
      IS_GET_NEWER,
      IS_GET_OLDER,
      IS_POLLING,
      IS_SESSION_START,
      TIMELINE_KIND
    )
}
|
|
@ -0,0 +1,13 @@
|
|||
# Scala library target whose sources are matched by the "*.scala" glob.
# Besides the ML API base and personal-data thrift targets, it declares dependencies on the
# simclusters_v2 thrift, the suggestion-record thrift and the aggregation framework targets.
scala_library(
|
||||
sources = ["*.scala"],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"src/java/com/twitter/ml/api:api-base",
|
||||
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
|
||||
"src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
|
||||
"src/thrift/com/twitter/timelines/suggests/common:record-scala",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
"timelines/data_processing/ml_util/aggregation_framework/conversion:for-timelines",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,61 @@
|
|||
package com.twitter.timelines.prediction.features.simcluster
|
||||
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.ml.api.Feature._
|
||||
import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Converts a user's interested-in SimClusters into [[SimclusterFeatures]].
 *
 * @param statsReceiver stats receiver; counters are scoped under this class's simple name
 */
class SimclusterFeaturesHelper(statsReceiver: StatsReceiver) {
  import SimclusterFeatures._

  private[this] val scopedStatsReceiver = statsReceiver.scope(getClass.getSimpleName)
  private[this] val invalidSimclusterModelVersion =
    scopedStatsReceiver.counter("invalidSimclusterModelVersion")

  /**
   * Builds the features for one (userId, clusters) pair.
   *
   * @param userInterestClustersPair the user id and the clusters that user is interested in
   * @return the features, holding the fav score of every cluster that has one, or None when
   *         the clusters were not produced by SIMCLUSTER_MODEL_VERSION
   */
  def fromUserClusterInterestsPair(
    userInterestClustersPair: (Long, ClustersUserIsInterestedIn)
  ): Option[SimclusterFeatures] = {
    val (userId, interestedIn) = userInterestClustersPair
    val modelVersion = interestedIn.knownForModelVersion

    if (modelVersion != SIMCLUSTER_MODEL_VERSION) {
      // We maintain this counter to make sure that the hardcoded modelVersion we are using is correct.
      invalidSimclusterModelVersion.incr()
      None
    } else {
      // Clusters without a fav score are left out of the map.
      val favScoreByClusterId = interestedIn.clusterIdToScores.flatMap {
        case (clusterId, scores) => scores.favScore.map(clusterId.toString -> _)
      }.toMap

      Some(SimclusterFeatures(userId, modelVersion, favScoreByClusterId))
    }
  }
}
|
||||
|
||||
/** Constants and feature definitions shared by the SimCluster feature code. */
object SimclusterFeatures {
  // Check http://go/simclustersv2runbook for production versions
  // Our models are trained for this specific model version only.
  val SIMCLUSTER_MODEL_VERSION: String = "20M_145K_dec11"

  // Common feature-name prefix; it embeds the model version above.
  val prefix: String = s"simcluster.v2.$SIMCLUSTER_MODEL_VERSION"

  val SIMCLUSTER_USER_INTEREST_CLUSTER_SCORES: SparseContinuous =
    new SparseContinuous(
      s"$prefix.user_interest_cluster_scores",
      Set(EngagementScore, InferredInterests).asJava
    )

  val SIMCLUSTER_USER_INTEREST_CLUSTER_IDS: SparseBinary =
    new SparseBinary(
      s"$prefix.user_interest_cluster_ids",
      Set(InferredInterests).asJava
    )

  val SIMCLUSTER_MODEL_VERSION_METADATA: Text = new Text("meta.simcluster_version")
}
|
||||
|
||||
/**
 * SimCluster interest scores for a single user.
 *
 * @param userId id of the user
 * @param modelVersion SimClusters model version the scores belong to
 * @param interestClusterScoresMap score per cluster, keyed by the cluster id as a string
 */
case class SimclusterFeatures(
  userId: Long,
  modelVersion: String,
  interestClusterScoresMap: Map[String, Double]
)
|
|
@ -0,0 +1,150 @@
|
|||
package com.twitter.timelines.prediction.features.simcluster
|
||||
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.ml.api.{Feature, FeatureContext}
|
||||
import com.twitter.ml.api.Feature.{Continuous, SparseBinary, SparseContinuous}
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion._
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
|
||||
import com.twitter.timelines.suggests.common.record.thriftscala.SuggestionRecord
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Builds SimCluster overlap features for a tweet candidate.
 *
 * For the cluster ids present both in the user's interested-in clusters and in the tweet's
 * top-K clusters, it emits the tweet scores and the (user score * tweet score) products as the
 * precomputed count features consumed through [[CombineCountsBase]].
 *
 * @param statsReceiver stats receiver; counters are scoped under this class's simple name
 */
class SimclusterTweetFeatures(statsReceiver: StatsReceiver) extends CombineCountsBase {
  import SimclusterTweetFeatures._

  private[this] val scopedStatsReceiver = statsReceiver.scope(getClass.getSimpleName)
  private[this] val invalidSimclusterModelVersion =
    scopedStatsReceiver.counter("invalidSimclusterModelVersion")
  private[this] val getFeaturesFromOverlappingSimclusterIdsCount =
    scopedStatsReceiver.counter("getFeaturesFromOverlappingSimclusterIdsCount")
  private[this] val emptySimclusterMaps =
    scopedStatsReceiver.counter("emptySimclusterMaps")
  private[this] val nonOverlappingSimclusterMaps =
    scopedStatsReceiver.counter("nonOverlappingSimclusterMaps")

  // Parameters required by CombineCountsBase
  override val topK: Int = 5
  override val hardLimit: Option[Int] = None
  override val precomputedCountFeatures: Seq[Feature[_]] = Seq(
    SIMCLUSTER_TWEET_TOPK_SORT_BY_TWEET_SCORE,
    SIMCLUSTER_TWEET_TOPK_SORT_BY_COMBINED_SCORE
  )

  /**
   * Computes one tweet score and one combined score per cluster id shared by both maps.
   *
   * @param userSimclustersInterestedInMap user score per cluster id
   * @param tweetSimclustersTopKMap tweet score per cluster id
   * @return the two score lists keyed by their feature, or an empty map when either input is
   *         empty or the inputs share no cluster id
   */
  private def getFeaturesFromOverlappingSimclusterIds(
    userSimclustersInterestedInMap: Map[String, Double],
    tweetSimclustersTopKMap: Map[String, Double]
  ): Map[Feature[_], List[Double]] = {
    getFeaturesFromOverlappingSimclusterIdsCount.incr()
    if (userSimclustersInterestedInMap.isEmpty || tweetSimclustersTopKMap.isEmpty) {
      emptySimclusterMaps.incr()
      Map.empty
    } else {
      // Convert to a List before mapping: mapping over the Set itself would produce Sets of
      // scores, silently collapsing clusters that happen to share the same score and letting
      // the two output lists end up with different lengths.
      val overlappingSimclusterIds =
        (userSimclustersInterestedInMap.keySet intersect tweetSimclustersTopKMap.keySet).toList
      if (overlappingSimclusterIds.isEmpty) {
        nonOverlappingSimclusterMaps.incr()
        Map.empty
      } else {
        val (combinedScores, tweetScores) = overlappingSimclusterIds.map { id =>
          val tweetScore = tweetSimclustersTopKMap.getOrElse(id, 0.0)
          val combinedScore = userSimclustersInterestedInMap.getOrElse(id, 0.0) * tweetScore
          (combinedScore, tweetScore)
        }.unzip
        Map(
          SIMCLUSTER_TWEET_TOPK_SORT_BY_COMBINED_SCORE -> combinedScores,
          SIMCLUSTER_TWEET_TOPK_SORT_BY_TWEET_SCORE -> tweetScores
        )
      }
    }
  }

  /**
   * Computes the overlap features for one suggestion.
   *
   * @param suggestionRecord record providing the user's interested-in clusters
   * @param simclustersTweetTopKMap tweet scores keyed by fully qualified cluster id
   * @return score lists keyed by feature; empty when there is no overlap
   */
  def getCountFeaturesValuesMap(
    suggestionRecord: SuggestionRecord,
    simclustersTweetTopKMap: Map[String, Double]
  ): Map[Feature[_], List[Double]] = {
    val userSimclustersInterestedInMap = formatUserSimclustersInterestedIn(suggestionRecord)

    val tweetSimclustersTopKMap = formatTweetSimclustersTopK(simclustersTweetTopKMap)

    getFeaturesFromOverlappingSimclusterIds(userSimclustersInterestedInMap, tweetSimclustersTopKMap)
  }

  /**
   * Keeps only the entries whose cluster id mentions the supported model version.
   *
   * @param simclustersMapOpt scores keyed by fully qualified cluster id, if available
   * @return the retained entries, or None when the input is None or nothing is retained
   */
  def filterByModelVersion(
    simclustersMapOpt: Option[Map[String, Double]]
  ): Option[Map[String, Double]] = {
    simclustersMapOpt.flatMap { simclustersMap =>
      val filteredSimclustersMap = simclustersMap.filter {
        case (clusterId, _) =>
          // The clusterId format is ModelVersion.IntegerClusterId.ScoreType as specified at
          // com.twitter.ml.featurestore.catalog.features.recommendations.SimClustersV2TweetTopClusters
          clusterId.contains(SimclusterFeatures.SIMCLUSTER_MODEL_VERSION)
      }

      // The assumption is that the simclustersMap will contain clusterIds with the same modelVersion.
      // We maintain this counter to make sure that the hardcoded modelVersion we are using is correct.
      if (simclustersMap.size > filteredSimclustersMap.size) {
        invalidSimclusterModelVersion.incr()
      }

      if (filteredSimclustersMap.nonEmpty) Some(filteredSimclustersMap) else None
    }
  }

  /** Post-merge output features plus the raw top-K cluster id and score features. */
  val allFeatures: Seq[Feature[_]] = outputFeaturesPostMerge.toSeq ++ Seq(
    SIMCLUSTER_TWEET_TOPK_CLUSTER_IDS,
    SIMCLUSTER_TWEET_TOPK_CLUSTER_SCORES)

  /** Feature context containing everything in [[allFeatures]]. */
  val featureContext = new FeatureContext(allFeatures: _*)
}
|
||||
|
||||
/** Feature definitions and input-formatting helpers for the tweet SimCluster features. */
object SimclusterTweetFeatures {
  val SIMCLUSTER_TWEET_TOPK_CLUSTER_IDS: SparseBinary =
    new SparseBinary(
      s"${SimclusterFeatures.prefix}.tweet_topk_cluster_ids",
      Set(InferredInterests).asJava
    )

  val SIMCLUSTER_TWEET_TOPK_CLUSTER_SCORES: SparseContinuous =
    new SparseContinuous(
      s"${SimclusterFeatures.prefix}.tweet_topk_cluster_scores",
      Set(EngagementScore, InferredInterests).asJava
    )

  val SIMCLUSTER_TWEET_TOPK_CLUSTER_ID =
    TypedAggregateGroup.sparseFeature(SIMCLUSTER_TWEET_TOPK_CLUSTER_IDS)

  val SIMCLUSTER_TWEET_TOPK_SORT_BY_TWEET_SCORE: Continuous =
    new Continuous(
      s"${SimclusterFeatures.prefix}.tweet_topk_sort_by_tweet_score",
      Set(EngagementScore, InferredInterests).asJava
    )

  val SIMCLUSTER_TWEET_TOPK_SORT_BY_COMBINED_SCORE: Continuous =
    new Continuous(
      s"${SimclusterFeatures.prefix}.tweet_topk_sort_by_combined_score",
      Set(EngagementScore, InferredInterests).asJava
    )

  /**
   * Extracts the user's fav score per interested-in cluster.
   *
   * @param suggestionRecord record that may carry the user's interested-in clusters
   * @return fav score keyed by cluster id (as a string); empty when the record has no clusters
   *         or they were not produced by SIMCLUSTER_MODEL_VERSION. Clusters without a fav
   *         score are omitted.
   */
  def formatUserSimclustersInterestedIn(suggestionRecord: SuggestionRecord): Map[String, Double] = {
    suggestionRecord.userSimclustersInterestedIn
      .filter(_.knownForModelVersion == SimclusterFeatures.SIMCLUSTER_MODEL_VERSION)
      .fold(Map.empty[String, Double]) { interestedIn =>
        interestedIn.clusterIdToScores.flatMap {
          case (clusterId, scores) => scores.favScore.map(clusterId.toString -> _)
        }.toMap
      }
  }

  /**
   * Re-keys the tweet's top-K scores by the bare integer cluster id.
   *
   * @param simclustersTweetTopKMap scores keyed by fully qualified cluster id
   * @return the same scores keyed by the second dot-separated segment of each original key
   */
  def formatTweetSimclustersTopK(
    simclustersTweetTopKMap: Map[String, Double]
  ): Map[String, Double] = {
    simclustersTweetTopKMap.map {
      case (qualifiedClusterId, score) =>
        // The clusterId format is <ModelVersion.IntegerClusterId.ScoreType> as specified at
        // com.twitter.ml.featurestore.catalog.features.recommendations.SimClustersV2TweetTopClusters
        // and only the IntegerClusterId is wanted. split takes a regex, so the dot must be
        // escaped, and the backslash must itself be escaped inside the string literal.
        val segments = qualifiedClusterId.split("\\.")
        // The IntegerClusterId is at position 1; a key with fewer than two segments makes this
        // indexing throw.
        segments(1) -> score
    }
  }
}
|
|
@ -0,0 +1,43 @@
|
|||
package com.twitter.timelines.prediction.features.simcluster
|
||||
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType.SemanticcoreClassification
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.ml.api.Feature.Continuous
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion.CombineCountsBase
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Feature definitions for the `recommendations.sim_clusters_scores.*` similarity scores,
 * exposed as precomputed count features through [[CombineCountsBase]].
 */
object SimclustersScoresFeatures extends CombineCountsBase {
  override def topK: Int = 2

  override def hardLimit: Option[Int] = Some(20)

  // Common prefix of every feature name declared below.
  val prefix: String = "recommendations.sim_clusters_scores"

  val TOPIC_CONSUMER_TWEET_EMBEDDING_Cs: Continuous =
    new Continuous(
      s"$prefix.localized_topic_consumer_tweet_embedding_cosine_similarity",
      Set(SemanticcoreClassification).asJava
    )

  val TOPIC_PRODUCER_TWEET_EMBEDDING_Cs: Continuous =
    new Continuous(
      s"$prefix.topic_producer_tweet_embedding_cosine_similarity",
      Set(SemanticcoreClassification).asJava
    )

  val USER_TOPIC_CONSUMER_TWEET_EMBEDDING_COSINE_SIM: Continuous =
    new Continuous(
      s"$prefix.user_interested_in_localized_topic_consumer_embedding_cosine_similarity",
      Set(SemanticcoreClassification).asJava
    )

  val USER_TOPIC_CONSUMER_TWEET_EMBEDDING_DOT_PRODUCT: Continuous =
    new Continuous(
      s"$prefix.user_interested_in_localized_topic_consumer_embedding_dot_product",
      Set(SemanticcoreClassification).asJava
    )

  val USER_TOPIC_PRODUCER_TWEET_EMBEDDING_COSINE_SIM: Continuous =
    new Continuous(
      s"$prefix.user_interested_in_localized_topic_producer_embedding_cosine_similarity",
      Set(SemanticcoreClassification).asJava
    )

  val USER_TOPIC_PRODUCER_TWEET_EMBEDDING_DOT_PRODUCT: Continuous =
    new Continuous(
      s"$prefix.user_interested_in_localized_topic_producer_embedding_dot_product",
      Set(SemanticcoreClassification).asJava
    )

  /** The six similarity features above, in declaration order. */
  override def precomputedCountFeatures: Seq[Feature[_]] =
    Seq(
      TOPIC_CONSUMER_TWEET_EMBEDDING_Cs,
      TOPIC_PRODUCER_TWEET_EMBEDDING_Cs,
      USER_TOPIC_CONSUMER_TWEET_EMBEDDING_COSINE_SIM,
      USER_TOPIC_CONSUMER_TWEET_EMBEDDING_DOT_PRODUCT,
      USER_TOPIC_PRODUCER_TWEET_EMBEDDING_COSINE_SIM,
      USER_TOPIC_PRODUCER_TWEET_EMBEDDING_DOT_PRODUCT
    )
}
|
|
@ -0,0 +1,15 @@
|
|||
# Scala library target named "socialproof_features"; sources are matched by the "*.scala" glob.
# Declared dependencies: icu4j, the ML API (base, Scala utilities, data thrift), the timelines
# utilities, the personal-data thrift and the social-proof thrift.
scala_library(
|
||||
name = "socialproof_features",
|
||||
sources = ["*.scala"],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"3rdparty/jvm/com/ibm/icu:icu4j",
|
||||
"src/java/com/twitter/ml/api:api-base",
|
||||
"src/scala/com/twitter/ml/api/util",
|
||||
"src/scala/com/twitter/timelines/util",
|
||||
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
|
||||
"src/thrift/com/twitter/ml/api:data-java",
|
||||
"src/thrift/com/twitter/timelines/socialproof:socialproof-scala",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,172 @@
|
|||
package com.twitter.timelines.prediction.features.socialproof
|
||||
|
||||
import com.twitter.ml.api.DataRecord
|
||||
import com.twitter.ml.api.Feature.Binary
|
||||
import com.twitter.ml.api.Feature.Continuous
|
||||
import com.twitter.ml.api.Feature.SparseBinary
|
||||
import com.twitter.ml.api.util.FDsl._
|
||||
import com.twitter.timelines.prediction.features.socialproof.SocialProofDataRecordFeatures._
|
||||
import com.twitter.timelines.socialproof.thriftscala.SocialProof
|
||||
import com.twitter.timelines.socialproof.v1.thriftscala.SocialProofType
|
||||
import com.twitter.timelines.util.CommonTypes.UserId
|
||||
import scala.collection.JavaConverters._
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
|
||||
/**
 * Base class for one kind of social proof: the users displayed for it and the total number of
 * users behind it.
 *
 * @param userIds ids of the displayed users
 * @param count total number of users; must not be smaller than the size of userIds
 */
abstract class SocialProofUserGroundTruth(userIds: Seq[UserId], count: Int) {
  require(
    userIds.size <= count,
    "count must be equal to or greater than the number of entries in userIds"
  )

  // The counts are exposed as Double so they can be used directly as ML feature values.
  val displayedUserCount: Double = userIds.size.toDouble
  val totalCount: Double = count.toDouble
  val undisplayedUserCount: Double = totalCount - displayedUserCount

  /** Feature receiving the displayed user ids. */
  def featureDisplayedUsers: SparseBinary

  /** Feature receiving the number of displayed users. */
  def featureDisplayedUserCount: Continuous

  /** Feature receiving the number of users that are counted but not displayed. */
  def featureUndisplayedUserCount: Continuous

  /** Feature receiving the total number of users. */
  def featureTotalUserCount: Continuous

  /**
   * Writes the displayed user ids and the three counts into the given record.
   *
   * @param rec data record to update in place
   */
  def setFeatures(rec: DataRecord): Unit = {
    val displayedUserIds = toStringSet(userIds)
    rec.setFeatureValue(featureDisplayedUsers, displayedUserIds)
    rec.setFeatureValue(featureDisplayedUserCount, displayedUserCount)
    rec.setFeatureValue(featureUndisplayedUserCount, undisplayedUserCount)
    rec.setFeatureValue(featureTotalUserCount, totalCount)
  }

  /** Renders each id as a string and collects the results into a set. */
  protected def toStringSet(value: Seq[Long]): Set[String] =
    value.map(id => id.toString).toSet
}
|
||||
|
||||
/**
 * "Favorited by" social proof, bound to the favorited-by features.
 *
 * @param userIds ids of the displayed users
 * @param count total number of users
 */
case class FavoritedBySocialProofUserGroundTruth(
  userIds: Seq[UserId] = Seq.empty,
  count: Int = 0)
    extends SocialProofUserGroundTruth(userIds, count) {

  override val featureDisplayedUsers: SparseBinary =
    SocialProofDisplayedFavoritedByUsers
  override val featureDisplayedUserCount: Continuous =
    SocialProofDisplayedFavoritedByUserCount
  override val featureUndisplayedUserCount: Continuous =
    SocialProofUndisplayedFavoritedByUserCount
  override val featureTotalUserCount: Continuous =
    SocialProofTotalFavoritedByUserCount
}
|
||||
|
||||
/**
 * "Retweeted by" social proof, bound to the retweeted-by features.
 *
 * @param userIds ids of the displayed users
 * @param count total number of users
 */
case class RetweetedBySocialProofUserGroundTruth(
  userIds: Seq[UserId] = Seq.empty,
  count: Int = 0)
    extends SocialProofUserGroundTruth(userIds, count) {

  override val featureDisplayedUsers: SparseBinary =
    SocialProofDisplayedRetweetedByUsers
  override val featureDisplayedUserCount: Continuous =
    SocialProofDisplayedRetweetedByUserCount
  override val featureUndisplayedUserCount: Continuous =
    SocialProofUndisplayedRetweetedByUserCount
  override val featureTotalUserCount: Continuous =
    SocialProofTotalRetweetedByUserCount
}
|
||||
|
||||
/**
 * "Replied by" social proof, bound to the replied-by features.
 *
 * @param userIds ids of the displayed users
 * @param count total number of users
 */
case class RepliedBySocialProofUserGroundTruth(
  userIds: Seq[UserId] = Seq.empty,
  count: Int = 0)
    extends SocialProofUserGroundTruth(userIds, count) {

  override val featureDisplayedUsers: SparseBinary =
    SocialProofDisplayedRepliedByUsers
  override val featureDisplayedUserCount: Continuous =
    SocialProofDisplayedRepliedByUserCount
  override val featureUndisplayedUserCount: Continuous =
    SocialProofUndisplayedRepliedByUserCount
  override val featureTotalUserCount: Continuous =
    SocialProofTotalRepliedByUserCount
}
|
||||
|
||||
/**
 * Social proof of a candidate, split by proof type.
 *
 * @param hasSocialProof whether any social proof is present
 * @param favoritedBy "favorited by" proof; defaults to an empty one
 * @param retweetedBy "retweeted by" proof; defaults to an empty one
 * @param repliedBy "replied by" proof; defaults to an empty one
 */
case class SocialProofFeatures(
  hasSocialProof: Boolean,
  favoritedBy: FavoritedBySocialProofUserGroundTruth = FavoritedBySocialProofUserGroundTruth(),
  retweetedBy: RetweetedBySocialProofUserGroundTruth = RetweetedBySocialProofUserGroundTruth(),
  repliedBy: RepliedBySocialProofUserGroundTruth = RepliedBySocialProofUserGroundTruth()) {

  /**
   * Writes the social proof features into the given record. Nothing is written when
   * hasSocialProof is false.
   *
   * @param dataRecord data record to update in place
   */
  def setFeatures(dataRecord: DataRecord): Unit = {
    if (hasSocialProof) {
      dataRecord.setFeatureValue(HasSocialProof, hasSocialProof)
      // Same order as the fields: favorited, retweeted, replied.
      Seq(favoritedBy, retweetedBy, repliedBy).foreach { groundTruth =>
        groundTruth.setFeatures(dataRecord)
      }
    }
  }
}
|
||||
|
||||
object SocialProofFeatures {

  /**
   * Folds thrift social proofs into a single [[SocialProofFeatures]].
   *
   * Each FavoritedBy, RetweetedBy or RepliedBy proof replaces the corresponding field, so the
   * last proof of a given type wins. Other proof types are ignored.
   *
   * @param socialProofs thrift social proofs; hasSocialProof is set when this is non-empty
   */
  def apply(socialProofs: Seq[SocialProof]): SocialProofFeatures = {
    val initial = SocialProofFeatures(hasSocialProof = socialProofs.nonEmpty)

    socialProofs.foldLeft(initial) { (features, socialProof) =>
      val proof = socialProof.v1
      val userIds = proof.userIds
      val count = proof.count

      proof.socialProofType match {
        case SocialProofType.FavoritedBy =>
          features.copy(favoritedBy = FavoritedBySocialProofUserGroundTruth(userIds, count))
        case SocialProofType.RetweetedBy =>
          features.copy(retweetedBy = RetweetedBySocialProofUserGroundTruth(userIds, count))
        case SocialProofType.RepliedBy =>
          features.copy(repliedBy = RepliedBySocialProofUserGroundTruth(userIds, count))
        case _ =>
          // skip silently instead of breaking jobs, since this isn't used yet
          features
      }
    }
  }
}
|
||||
|
||||
/**
 * Feature definitions for the `recap.social_proof.*` namespace: a presence flag plus, for each
 * of favorited-by, retweeted-by and replied-by, the displayed user list and the displayed,
 * undisplayed and total user counts.
 */
object SocialProofDataRecordFeatures {
  val HasSocialProof: Binary = new Binary("recap.social_proof.has_social_proof")

  // Favorited by
  val SocialProofDisplayedFavoritedByUsers: SparseBinary =
    new SparseBinary(
      "recap.social_proof.list.displayed.favorited_by",
      Set(UserId, PublicLikes, PrivateLikes).asJava
    )
  val SocialProofDisplayedFavoritedByUserCount: Continuous =
    new Continuous(
      "recap.social_proof.count.displayed.favorited_by",
      Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
    )
  val SocialProofUndisplayedFavoritedByUserCount: Continuous =
    new Continuous(
      "recap.social_proof.count.undisplayed.favorited_by",
      Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
    )
  val SocialProofTotalFavoritedByUserCount: Continuous =
    new Continuous(
      "recap.social_proof.count.total.favorited_by",
      Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
    )

  // Retweeted by
  val SocialProofDisplayedRetweetedByUsers: SparseBinary =
    new SparseBinary(
      "recap.social_proof.list.displayed.retweeted_by",
      Set(UserId, PublicRetweets, PrivateRetweets).asJava
    )
  val SocialProofDisplayedRetweetedByUserCount: Continuous =
    new Continuous(
      "recap.social_proof.count.displayed.retweeted_by",
      Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
    )
  val SocialProofUndisplayedRetweetedByUserCount: Continuous =
    new Continuous(
      "recap.social_proof.count.undisplayed.retweeted_by",
      Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
    )
  val SocialProofTotalRetweetedByUserCount: Continuous =
    new Continuous(
      "recap.social_proof.count.total.retweeted_by",
      Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
    )

  // Replied by
  val SocialProofDisplayedRepliedByUsers: SparseBinary =
    new SparseBinary(
      "recap.social_proof.list.displayed.replied_by",
      Set(UserId, PublicReplies, PrivateReplies).asJava
    )
  val SocialProofDisplayedRepliedByUserCount: Continuous =
    new Continuous(
      "recap.social_proof.count.displayed.replied_by",
      Set(CountOfPrivateReplies, CountOfPublicReplies).asJava
    )
  val SocialProofUndisplayedRepliedByUserCount: Continuous =
    new Continuous(
      "recap.social_proof.count.undisplayed.replied_by",
      Set(CountOfPrivateReplies, CountOfPublicReplies).asJava
    )
  val SocialProofTotalRepliedByUserCount: Continuous =
    new Continuous(
      "recap.social_proof.count.total.replied_by",
      Set(CountOfPrivateReplies, CountOfPublicReplies).asJava
    )

  // Every feature declared above, in declaration order.
  val AllFeatures = Seq(
    HasSocialProof,
    SocialProofDisplayedFavoritedByUsers,
    SocialProofDisplayedFavoritedByUserCount,
    SocialProofUndisplayedFavoritedByUserCount,
    SocialProofTotalFavoritedByUserCount,
    SocialProofDisplayedRetweetedByUsers,
    SocialProofDisplayedRetweetedByUserCount,
    SocialProofUndisplayedRetweetedByUserCount,
    SocialProofTotalRetweetedByUserCount,
    SocialProofDisplayedRepliedByUsers,
    SocialProofDisplayedRepliedByUserCount,
    SocialProofUndisplayedRepliedByUserCount,
    SocialProofTotalRepliedByUserCount
  )
}
|
|
@ -0,0 +1,10 @@
|
|||
# Scala library target whose sources are matched by the "*.scala" glob.
# It declares dependencies on the ML API base target, the personal-data thrift (Java) target and
# the time_features thrift (Scala) target.
scala_library(
|
||||
sources = ["*.scala"],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"src/java/com/twitter/ml/api:api-base",
|
||||
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
|
||||
"src/thrift/com/twitter/timelines/time_features:time_features-scala",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,111 @@
|
|||
package com.twitter.timelines.prediction.features.time_features
|
||||
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import com.twitter.ml.api.Feature._
|
||||
import scala.collection.JavaConverters._
|
||||
import com.twitter.util.Duration
|
||||
import com.twitter.conversions.DurationOps._
|
||||
|
||||
/**
 * Feature definitions for time-related signals (request timing, tweet age,
 * last-engagement timing and viewer account age).
 *
 * Each feature is declared with its name and, where applicable, the set of
 * personal data types (PDTs) it is annotated with.
 */
object TimeDataRecordFeatures {
  // PDT annotation sets reused by several features below. They are defs (not vals) so
  // every feature still receives its own freshly built java.util.Set.
  private def timestampPdts = Set(PrivateTimestamp).asJava
  private def likePdts = Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
  private def retweetPdts = Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
  private def replyPdts = Set(CountOfPrivateReplies, CountOfPublicReplies).asJava
  private def accountAgePdts = Set(AccountCreationTime, AgeOfAccount).asJava

  val TIME_BETWEEN_NON_POLLING_REQUESTS_AVG =
    new Continuous("time_features.time_between_non_polling_requests_avg", timestampPdts)
  val TIME_SINCE_TWEET_CREATION =
    new Continuous("time_features.time_since_tweet_creation")
  val TIME_SINCE_SOURCE_TWEET_CREATION =
    new Continuous("time_features.time_since_source_tweet_creation")
  val TIME_SINCE_LAST_NON_POLLING_REQUEST =
    new Continuous("time_features.time_since_last_non_polling_request", timestampPdts)
  val NON_POLLING_REQUESTS_SINCE_TWEET_CREATION =
    new Continuous("time_features.non_polling_requests_since_tweet_creation", timestampPdts)
  val TWEET_AGE_RATIO = new Continuous("time_features.tweet_age_ratio")
  val IS_TWEET_RECYCLED = new Binary("time_features.is_tweet_recycled")

  // Last engagement features.
  // Note: the quote features carry the retweet PDT annotations.
  val LAST_FAVORITE_SINCE_CREATION_HRS =
    new Continuous("time_features.earlybird.last_favorite_since_creation_hrs", likePdts)
  val LAST_RETWEET_SINCE_CREATION_HRS =
    new Continuous("time_features.earlybird.last_retweet_since_creation_hrs", retweetPdts)
  val LAST_REPLY_SINCE_CREATION_HRS =
    new Continuous("time_features.earlybird.last_reply_since_creation_hrs", replyPdts)
  val LAST_QUOTE_SINCE_CREATION_HRS =
    new Continuous("time_features.earlybird.last_quote_since_creation_hrs", retweetPdts)
  val TIME_SINCE_LAST_FAVORITE_HRS =
    new Continuous("time_features.earlybird.time_since_last_favorite", likePdts)
  val TIME_SINCE_LAST_RETWEET_HRS =
    new Continuous("time_features.earlybird.time_since_last_retweet", retweetPdts)
  val TIME_SINCE_LAST_REPLY_HRS =
    new Continuous("time_features.earlybird.time_since_last_reply", replyPdts)
  val TIME_SINCE_LAST_QUOTE_HRS =
    new Continuous("time_features.earlybird.time_since_last_quote", retweetPdts)

  // Viewer account features.
  val TIME_SINCE_VIEWER_ACCOUNT_CREATION_SECS =
    new Continuous("time_features.time_since_viewer_account_creation_secs", accountAgePdts)

  val USER_ID_IS_SNOWFLAKE_ID =
    new Binary("time_features.time_user_id_is_snowflake_id", Set(UserType).asJava)

  val IS_30_DAY_NEW_USER =
    new Binary("time_features.is_day_30_new_user", accountAgePdts)
  val IS_12_MONTH_NEW_USER =
    new Binary("time_features.is_month_12_new_user", accountAgePdts)
  val ACCOUNT_AGE_INTERVAL =
    new Discrete("time_features.account_age_interval", Set(AgeOfAccount).asJava)
}
|
||||
|
||||
/**
 * Buckets of account age, from at most one day up to at most thirty days.
 */
object AccountAgeInterval extends Enumeration {
  val LTE_1_DAY, GT_1_DAY_LTE_5_DAY, GT_5_DAY_LTE_14_DAY, GT_14_DAY_LTE_30_DAY = Value

  /**
   * Maps an account age to its bucket.
   *
   * @param accountAge age of the account
   * @return the matching bucket, or None when the age is above 30 days
   *         (or otherwise compares as not `<=` any of the upper bounds)
   */
  def fromDuration(accountAge: Duration): Option[AccountAgeInterval.Value] =
    // The upper bounds are tested in ascending order, so reaching a branch already
    // implies the age exceeded every smaller bound.
    if (accountAge <= 1.day) Some(LTE_1_DAY)
    else if (accountAge <= 5.days) Some(GT_1_DAY_LTE_5_DAY)
    else if (accountAge <= 14.days) Some(GT_5_DAY_LTE_14_DAY)
    else if (accountAge <= 30.days) Some(GT_14_DAY_LTE_30_DAY)
    else None
}
|
||||
|
||||
/**
 * Plain value holder for the time-related values of one record.
 *
 * The first six fields are required; every other field is optional and defaults to None.
 * NOTE(review): units are only hinted at by the field-name suffixes (`Hrs`, `Secs`);
 * fields without a suffix have no unit visible in this file — confirm with the producer.
 */
case class TimeFeatures(
|
||||
isTweetRecycled: Boolean,
|
||||
timeSinceTweetCreation: Double,
|
||||
isDay30NewUser: Boolean,
|
||||
isMonth12NewUser: Boolean,
|
||||
timeSinceSourceTweetCreation: Double, // same as timeSinceTweetCreation for non-retweets
|
||||
timeSinceViewerAccountCreationSecs: Option[Double],
|
||||
timeBetweenNonPollingRequestsAvg: Option[Double] = None,
|
||||
timeSinceLastNonPollingRequest: Option[Double] = None,
|
||||
nonPollingRequestsSinceTweetCreation: Option[Double] = None,
|
||||
tweetAgeRatio: Option[Double] = None,
|
||||
lastFavSinceCreationHrs: Option[Double] = None,
|
||||
lastRetweetSinceCreationHrs: Option[Double] = None,
|
||||
lastReplySinceCreationHrs: Option[Double] = None,
|
||||
lastQuoteSinceCreationHrs: Option[Double] = None,
|
||||
timeSinceLastFavoriteHrs: Option[Double] = None,
|
||||
timeSinceLastRetweetHrs: Option[Double] = None,
|
||||
timeSinceLastReplyHrs: Option[Double] = None,
|
||||
timeSinceLastQuoteHrs: Option[Double] = None,
|
||||
accountAgeInterval: Option[AccountAgeInterval.Value] = None)
|
|
@ -0,0 +1,10 @@
|
|||
scala_library(
|
||||
sources = ["*.scala"],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"graph-feature-service/src/main/thrift/com/twitter/graph_feature_service:graph_feature_service_thrift-scala",
|
||||
"src/java/com/twitter/ml/api:api-base",
|
||||
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,93 @@
|
|||
package com.twitter.timelines.prediction.features.two_hop_features
|
||||
|
||||
import com.twitter.graph_feature_service.thriftscala.EdgeType
|
||||
import com.twitter.ml.api.Feature._
|
||||
import scala.collection.JavaConverters._
|
||||
import TwoHopFeaturesConfig.personalDataTypesMap
|
||||
|
||||
/**
 * Naming constants, type aliases and factory for [[TwoHopFeaturesDescriptor]].
 */
object TwoHopFeaturesDescriptor {
  // Fragments used when assembling two-hop feature names.
  val prefix = "two_hop"
  val normalizedPostfix = "normalized"
  val leftNodeDegreePostfix = "left_degree"
  val rightNodeDegreePostfix = "right_degree"

  // Features keyed by a (left edge type, right edge type) pair.
  type TwoHopFeatureMap = Map[(EdgeType, EdgeType), Continuous]
  // Features keyed by a single edge type.
  type TwoHopFeatureNodeDegreeMap = Map[EdgeType, Continuous]

  /** Creates a descriptor for the given (left, right) edge type pairs. */
  def apply(edgeTypePairs: Seq[(EdgeType, EdgeType)]): TwoHopFeaturesDescriptor =
    new TwoHopFeaturesDescriptor(edgeTypePairs)
}
|
||||
|
||||
/**
 * Builds the `Continuous` features for a list of (left, right) edge type pairs:
 * a raw and a normalized feature per pair, plus a node-degree feature per left
 * edge type and per right edge type.
 *
 * Feature names are assembled from the lower-cased `originalName` of the edge
 * types and the fragments defined in the companion object. Each feature is
 * annotated with the personal data types that `personalDataTypesMap` lists for
 * the edge types involved (none when an edge type has no entry).
 *
 * @param edgeTypePairs (left, right) edge type pairs to build features for
 */
class TwoHopFeaturesDescriptor(edgeTypePairs: Seq[(EdgeType, EdgeType)]) {
  import TwoHopFeaturesDescriptor._

  /** Left element of the pair. */
  def getLeftEdge(edgeTypePair: (EdgeType, EdgeType)): EdgeType = edgeTypePair._1

  /** Lower-cased original name of the left edge type. */
  def getLeftEdgeName(edgeTypePair: (EdgeType, EdgeType)): String =
    getLeftEdge(edgeTypePair).originalName.toLowerCase

  /** Right element of the pair. */
  def getRightEdge(edgeTypePair: (EdgeType, EdgeType)): EdgeType = edgeTypePair._2

  /** Lower-cased original name of the right edge type. */
  def getRightEdgeName(edgeTypePair: (EdgeType, EdgeType)): String =
    getRightEdge(edgeTypePair).originalName.toLowerCase

  // Personal data types registered for one edge type; empty when it has no entry.
  private def pdtsOf(edgeType: EdgeType) =
    personalDataTypesMap.getOrElse(edgeType, Set.empty)

  // Union of the personal data types of both edge types of a pair, as a Java set.
  private def pairPdts(edgeTypePair: (EdgeType, EdgeType)) =
    (pdtsOf(getLeftEdge(edgeTypePair)) ++ pdtsOf(getRightEdge(edgeTypePair))).asJava

  // `.toMap` replaces `collection.breakOut` (removed in Scala 2.13); as before, when a
  // key occurs more than once the entry produced last wins.
  val rawFeaturesMap: TwoHopFeatureMap = edgeTypePairs.map { edgeTypePair =>
    val name = s"$prefix.${getLeftEdgeName(edgeTypePair)}.${getRightEdgeName(edgeTypePair)}"
    edgeTypePair -> new Continuous(name, pairPdts(edgeTypePair))
  }.toMap

  val leftNodeDegreeFeaturesMap: TwoHopFeatureNodeDegreeMap = edgeTypePairs.map { edgeTypePair =>
    val leftEdgeType = getLeftEdge(edgeTypePair)
    val name = s"$prefix.${getLeftEdgeName(edgeTypePair)}.$leftNodeDegreePostfix"
    leftEdgeType -> new Continuous(name, pdtsOf(leftEdgeType).asJava)
  }.toMap

  val rightNodeDegreeFeaturesMap: TwoHopFeatureNodeDegreeMap = edgeTypePairs.map { edgeTypePair =>
    val rightEdgeType = getRightEdge(edgeTypePair)
    val name = s"$prefix.${getRightEdgeName(edgeTypePair)}.$rightNodeDegreePostfix"
    rightEdgeType -> new Continuous(name, pdtsOf(rightEdgeType).asJava)
  }.toMap

  val normalizedFeaturesMap: TwoHopFeatureMap = edgeTypePairs.map { edgeTypePair =>
    val name =
      s"$prefix.${getLeftEdgeName(edgeTypePair)}.${getRightEdgeName(edgeTypePair)}.$normalizedPostfix"
    edgeTypePair -> new Continuous(name, pairPdts(edgeTypePair))
  }.toMap

  private val rawFeaturesSeq: Seq[Continuous] = rawFeaturesMap.values.toSeq
  private val leftNodeDegreeFeaturesSeq: Seq[Continuous] = leftNodeDegreeFeaturesMap.values.toSeq
  private val rightNodeDegreeFeaturesSeq: Seq[Continuous] = rightNodeDegreeFeaturesMap.values.toSeq
  private val normalizedFeaturesSeq: Seq[Continuous] = normalizedFeaturesMap.values.toSeq

  /** All features: raw, then left degree, then right degree, then normalized. */
  val featuresSeq: Seq[Continuous] =
    rawFeaturesSeq ++ leftNodeDegreeFeaturesSeq ++ rightNodeDegreeFeaturesSeq ++ normalizedFeaturesSeq
}
|
|
@ -0,0 +1,30 @@
|
|||
package com.twitter.timelines.prediction.features.two_hop_features
|
||||
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType
|
||||
import com.twitter.graph_feature_service.thriftscala.{EdgeType, FeatureType}
|
||||
|
||||
/**
 * Static configuration for two-hop features: which edge types are combined and
 * which personal data types each edge type is annotated with.
 */
object TwoHopFeaturesConfig {
  val leftEdgeTypes = Seq(EdgeType.Following, EdgeType.Favorite, EdgeType.MutualFollow)

  val rightEdgeTypes = Seq(
    EdgeType.FollowedBy,
    EdgeType.FavoritedBy,
    EdgeType.RetweetedBy,
    EdgeType.MentionedBy,
    EdgeType.MutualFollow)

  /** Cross product of left and right edge types, left-major. */
  val edgeTypePairs: Seq[(EdgeType, EdgeType)] =
    leftEdgeTypes.flatMap { left =>
      rightEdgeTypes.map(right => (left, right))
    }

  /** One `FeatureType` per edge type pair, in the same order as `edgeTypePairs`. */
  val featureTypes: Seq[FeatureType] =
    edgeTypePairs.map { case (left, right) => FeatureType(left, right) }

  /**
   * Personal data type annotations per edge type. Edge types without an entry
   * here (e.g. FavoritedBy, RetweetedBy, MentionedBy) have no mapping.
   */
  val personalDataTypesMap: Map[EdgeType, Set[PersonalDataType]] = {
    val followCountPdts: Set[PersonalDataType] =
      Set(PersonalDataType.CountOfFollowersAndFollowees)
    val likeCountPdts: Set[PersonalDataType] =
      Set(PersonalDataType.CountOfPrivateLikes, PersonalDataType.CountOfPublicLikes)

    Map(
      EdgeType.Following -> followCountPdts,
      EdgeType.Favorite -> likeCountPdts,
      EdgeType.MutualFollow -> followCountPdts,
      EdgeType.FollowedBy -> followCountPdts
    )
  }
}
|
|
@ -0,0 +1,10 @@
|
|||
scala_library(
|
||||
sources = ["*.scala"],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"src/java/com/twitter/ml/api:api-base",
|
||||
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
|
||||
"src/thrift/com/twitter/timelines/author_features/user_health:thrift-scala",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,23 @@
|
|||
package com.twitter.timelines.prediction.features.user_health
|
||||
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.timelines.author_features.user_health.thriftscala.UserState
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType.{UserState => UserStatePDT}
|
||||
import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Feature definitions for user/author health signals.
 */
object UserHealthFeatures {
  // PDT annotations shared by the user-state features. A def, so each feature
  // still gets its own java.util.Set instance.
  private def userStatePdts = Set(UserStatePDT, UserType).asJava

  val UserState = new Feature.Discrete("user_health.user_state", userStatePdts)
  val IsLightMinusUser = new Feature.Binary("user_health.is_light_minus_user", userStatePdts)
  val AuthorState = new Feature.Discrete("user_health.author_state", userStatePdts)

  val NumAuthorFollowers = new Feature.Continuous(
    "author_health.num_followers",
    Set(CountOfFollowersAndFollowees).asJava
  )
  val NumAuthorConnectDays = new Feature.Continuous("author_health.num_connect_days")
  val NumAuthorConnect = new Feature.Continuous("author_health.num_connect")

  val IsUserVerifiedUnion = new Feature.Binary("user_account.is_user_verified_union")
}
|
||||
|
||||
// Pairs an id with an optional thrift `UserState`
// (com.twitter.timelines.author_features.user_health.thriftscala.UserState).
// NOTE(review): presumably `id` is a user/author id — confirm against callers.
case class UserHealthFeatures(id: Long, userStateOpt: Option[UserState])
|
|
@ -0,0 +1,124 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
|
||||
|
||||
import com.twitter.ml.api._
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetric
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.EasyMetric
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.MaxMetric
|
||||
import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform
|
||||
import com.twitter.util.Duration
|
||||
import java.lang.{Boolean => JBoolean}
|
||||
import java.lang.{Long => JLong}
|
||||
import scala.language.existentials
|
||||
|
||||
/**
|
||||
* A wrapper for [[com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup]]
|
||||
* (see TypedAggregateGroup.scala) with some convenient syntactic sugar that avoids
|
||||
* the user having to specify different groups for different types of features.
|
||||
* Gets translated into multiple strongly typed TypedAggregateGroup(s)
|
||||
* by the buildTypedAggregateGroups() method defined below.
|
||||
*
|
||||
* @param inputSource Source to compute this aggregate over
|
||||
* @param preTransforms Sequence of [[OneToSomeTransform]] that is applied to
|
||||
* data records pre-aggregation (e.g. discretization, renaming)
|
||||
* @param samplingTransformOpt Optional [[OneToSomeTransform]] that samples data record
|
||||
* @param aggregatePrefix Prefix to use for naming resultant aggregate features
|
||||
* @param keys Features to group by when computing the aggregates
|
||||
* (e.g. USER_ID, AUTHOR_ID). These must be either discrete, string or sparse binary.
|
||||
* Grouping by a sparse binary feature is different than grouping by a discrete or string
|
||||
* feature. For example, if you have a sparse binary feature WORDS_IN_TWEET which is
|
||||
* a set of all words in a tweet, then grouping by this feature generates a
|
||||
* separate aggregate mean/count/etc for each value of the feature (each word), and
|
||||
* not just a single aggregate count for different "sets of words"
|
||||
* @param features Features to aggregate (e.g. blender_score or is_photo).
|
||||
* @param labels Labels to cross the features with to make pair features, if any.
|
||||
* @param metrics Aggregation metrics to compute (e.g. count, mean)
|
||||
* @param halfLives Half lives to use for the aggregations, to be crossed with the above.
|
||||
* use Duration.Top for "forever" aggregations over an infinite time window (no decay).
|
||||
* @param outputStore Store to output this aggregate to
|
||||
* @param includeAnyFeature Aggregate label counts for any feature value
|
||||
* @param includeAnyLabel Aggregate feature counts for any label value (e.g. all impressions)
|
||||
* @param includeTimestampFeature compute max aggregate on timestamp feature
|
||||
* @param aggExclusionRegex Sequence of regex strings that define features to exclude (presumably from the generated aggregates; confirm in TypedAggregateGroup)
|
||||
*/
|
||||
case class AggregateGroup(
  inputSource: AggregateSource,
  aggregatePrefix: String,
  keys: Set[Feature[_]],
  features: Set[Feature[_]],
  labels: Set[_ <: Feature[JBoolean]],
  metrics: Set[EasyMetric],
  halfLives: Set[Duration],
  outputStore: AggregateStore,
  preTransforms: Seq[OneToSomeTransform] = Seq.empty,
  includeAnyFeature: Boolean = true,
  includeAnyLabel: Boolean = true,
  includeTimestampFeature: Boolean = false,
  aggExclusionRegex: Seq[String] = Seq.empty) {

  /**
   * Builds one TypedAggregateGroup for features that all share `featureType`.
   *
   * The easy metrics are resolved to the concrete metrics available for that
   * feature type (metrics with no implementation for the type are dropped), and
   * the features are cast to `Feature[T]`; the cast is unchecked.
   */
  private def toStrongType[T](
    metrics: Set[EasyMetric],
    features: Set[Feature[_]],
    featureType: FeatureType
  ): TypedAggregateGroup[_] = {
    val typedMetrics: Set[AggregationMetric[T, _]] =
      metrics.flatMap(_.forFeatureType[T](featureType))
    val typedFeatures: Set[Feature[T]] =
      features.map(_.asInstanceOf[Feature[T]])

    TypedAggregateGroup[T](
      inputSource = inputSource,
      aggregatePrefix = aggregatePrefix,
      keysToAggregate = keys,
      featuresToAggregate = typedFeatures,
      labels = labels,
      metrics = typedMetrics,
      halfLives = halfLives,
      outputStore = outputStore,
      preTransforms = preTransforms,
      includeAnyFeature = includeAnyFeature,
      includeAnyLabel = includeAnyLabel,
      aggExclusionRegex = aggExclusionRegex
    )
  }

  /**
   * Group that computes the max metric over `TypedAggregateGroup.timestampFeature`,
   * with no labels, no decay (`Duration.Top`) and no exclusion regexes.
   */
  private def timestampTypedAggregateGroup: TypedAggregateGroup[_] = {
    val maxOverTimestamp: Set[AggregationMetric[JLong, _]] =
      Set(MaxMetric.forFeatureType[JLong](TypedAggregateGroup.timestampFeature.getFeatureType).get)

    TypedAggregateGroup[JLong](
      inputSource = inputSource,
      aggregatePrefix = aggregatePrefix,
      keysToAggregate = keys,
      featuresToAggregate = Set(TypedAggregateGroup.timestampFeature),
      labels = Set.empty,
      metrics = maxOverTimestamp,
      halfLives = Set(Duration.Top),
      outputStore = outputStore,
      preTransforms = preTransforms,
      includeAnyFeature = false,
      includeAnyLabel = true,
      aggExclusionRegex = Seq.empty
    )
  }

  /**
   * Expands this group into strongly typed groups: one per distinct feature type
   * among `features` (or a single BINARY-typed group when `features` is empty),
   * followed by the timestamp group when `includeTimestampFeature` is set.
   */
  def buildTypedAggregateGroups(): List[TypedAggregateGroup[_]] = {
    val perFeatureTypeGroups: List[TypedAggregateGroup[_]] =
      if (features.isEmpty) {
        List(toStrongType(metrics, features, FeatureType.BINARY))
      } else {
        features
          .groupBy(_.getFeatureType())
          .toList
          .map { case (sharedType, sameTypedFeatures) =>
            toStrongType(metrics, sameTypedFeatures, sharedType)
          }
      }

    val timestampGroups: List[TypedAggregateGroup[_]] =
      if (includeTimestampFeature) List(timestampTypedAggregateGroup) else Nil

    perFeatureTypeGroups ++ timestampGroups
  }
}
|
|
@ -0,0 +1,9 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
|
||||
|
||||
import com.twitter.ml.api.Feature
|
||||
import java.lang.{Long => JLong}
|
||||
|
||||
/**
 * Serializable description of an input source for aggregation: a name and the
 * feature that holds each record's timestamp.
 */
trait AggregateSource extends Serializable {
|
||||
// Name of the source.
def name: String
|
||||
// Long-valued feature carrying the record timestamp.
def timestampFeature: Feature[JLong]
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
|
||||
|
||||
/**
 * Serializable description of an output store for aggregates, identified by a name.
 */
trait AggregateStore extends Serializable {
|
||||
// Name of the store.
def name: String
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
|
||||
|
||||
/**
 * Configuration contract: exposes the set of typed aggregate groups to compute.
 */
trait AggregationConfig {
|
||||
// All typed aggregate groups defined by this configuration.
def aggregatesToCompute: Set[TypedAggregateGroup[_]]
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
|
||||
|
||||
import com.twitter.bijection.Bufferable
|
||||
import com.twitter.bijection.Injection
|
||||
import scala.util.Try
|
||||
|
||||
/**
|
||||
* Case class that represents the "grouping" key for any aggregate feature.
|
||||
* Used by Summingbird to output aggregates to the key-value "store" using sumByKey()
|
||||
*
|
||||
* @param discreteFeaturesById All discrete feature ids (+ values) that are part of this key
|
||||
* @param textFeaturesById All string feature ids (+ values) that are part of this key
|
||||
*
|
||||
* Example 1: the user aggregate features in aggregatesv1 all group by USER_ID,
|
||||
* which is a discrete feature. When storing these features, the key would be:
|
||||
*
|
||||
* discreteFeaturesById = Map(hash(USER_ID) -> <the actual user id>), textFeaturesById = Map()
|
||||
*
|
||||
* Ex 2: If aggregating grouped by USER_ID, AUTHOR_ID, tweet link url, the key would be:
|
||||
*
|
||||
* discreteFeaturesById = Map(hash(USER_ID) -> <actual user id>, hash(AUTHOR_ID) -> <actual author id>),
|
||||
* textFeaturesById = Map(hash(URL_FEATURE) -> <the link url>)
|
||||
*
|
||||
* I could have just used a DataRecord for the key, but I wanted to make it strongly typed
|
||||
* and only support grouping by discrete and string features, so using a case class instead.
|
||||
*
|
||||
* Re: efficiency, storing the hash of the feature in addition to just the feature value
|
||||
* is somewhat more inefficient than only storing the feature value in the key, but it
|
||||
* adds flexibility to group multiple types of aggregates in the same output store. If we
|
||||
* decide this isn't a good tradeoff to make later, we can reverse/refactor this decision.
|
||||
*/
|
||||
case class AggregationKey(
|
||||
// hashed feature id -> value, for each discrete feature that is part of the key
discreteFeaturesById: Map[Long, Long],
|
||||
// hashed feature id -> value, for each string feature that is part of the key
textFeaturesById: Map[Long, String])
|
||||
|
||||
/**
|
||||
* A custom injection for the above case class,
|
||||
* so that Summingbird knows how to store it in Manhattan.
|
||||
*/
|
||||
object AggregationKeyInjection extends Injection[AggregationKey, Array[Byte]] {
  /* Injection from tuple representation of AggregationKey to Array[Byte] */
  val featureMapsInjection: Injection[(Map[Long, Long], Map[Long, String]), Array[Byte]] =
    Bufferable.injectionOf[(Map[Long, Long], Map[Long, String])]

  /**
   * Serializes the key as the tuple (discreteFeaturesById, textFeaturesById).
   *
   * The tuple is built explicitly from the two fields instead of going through
   * `AggregationKey.unapply(...).get`, which avoids calling `.get` on an Option.
   */
  def apply(aggregationKey: AggregationKey): Array[Byte] =
    featureMapsInjection((aggregationKey.discreteFeaturesById, aggregationKey.textFeaturesById))

  /**
   * Deserializes bytes produced by `apply`.
   *
   * @return the decoded key, or the failure reported by the underlying tuple injection
   */
  def invert(ab: Array[Byte]): Try[AggregationKey] =
    featureMapsInjection.invert(ab).map {
      case (discreteFeaturesById, textFeaturesById) =>
        AggregationKey(discreteFeaturesById, textFeaturesById)
    }
}
|
101
timelines/data_processing/ml_util/aggregation_framework/BUILD
Normal file
101
timelines/data_processing/ml_util/aggregation_framework/BUILD
Normal file
|
@ -0,0 +1,101 @@
|
|||
scala_library(
|
||||
name = "common_types",
|
||||
sources = ["*.scala"],
|
||||
platform = "java8",
|
||||
strict_deps = True,
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"3rdparty/jvm/com/google/guava",
|
||||
"3rdparty/jvm/com/twitter/algebird:bijection",
|
||||
"3rdparty/jvm/com/twitter/algebird:core",
|
||||
"3rdparty/jvm/com/twitter/algebird:util",
|
||||
"3rdparty/jvm/com/twitter/bijection:core",
|
||||
"3rdparty/jvm/com/twitter/bijection:json",
|
||||
"3rdparty/jvm/com/twitter/bijection:macros",
|
||||
"3rdparty/jvm/com/twitter/bijection:netty",
|
||||
"3rdparty/jvm/com/twitter/bijection:scrooge",
|
||||
"3rdparty/jvm/com/twitter/bijection:thrift",
|
||||
"3rdparty/jvm/com/twitter/bijection:util",
|
||||
"3rdparty/jvm/org/apache/thrift:libthrift",
|
||||
"3rdparty/src/jvm/com/twitter/scalding:date",
|
||||
"3rdparty/src/jvm/com/twitter/summingbird:batch",
|
||||
"src/java/com/twitter/ml/api:api-base",
|
||||
"src/java/com/twitter/ml/api/constant",
|
||||
"src/scala/com/twitter/dal/client/dataset",
|
||||
"src/scala/com/twitter/ml/api/util:datarecord",
|
||||
"src/scala/com/twitter/scalding_internal/dalv2/vkvs",
|
||||
"src/scala/com/twitter/scalding_internal/multiformat/format/keyval",
|
||||
"src/scala/com/twitter/storehaus_internal/manhattan/config",
|
||||
"src/scala/com/twitter/storehaus_internal/offline",
|
||||
"src/scala/com/twitter/storehaus_internal/util",
|
||||
"src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits",
|
||||
"src/scala/com/twitter/summingbird_internal/runner/store_config",
|
||||
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
|
||||
"src/thrift/com/twitter/dal/personal_data:personal_data-scala",
|
||||
"src/thrift/com/twitter/ml/api:data-java",
|
||||
"timelines/data_processing/ml_util/aggregation_framework/metrics",
|
||||
"timelines/data_processing/ml_util/transforms",
|
||||
"util/util-core:util-core-util",
|
||||
],
|
||||
)
|
||||
|
||||
target(
|
||||
name = "common_online_stores",
|
||||
dependencies = [
|
||||
"src/scala/com/twitter/storehaus_internal/memcache",
|
||||
],
|
||||
)
|
||||
|
||||
target(
|
||||
name = "common_offline_stores",
|
||||
dependencies = [
|
||||
"src/scala/com/twitter/storehaus_internal/manhattan",
|
||||
],
|
||||
)
|
||||
|
||||
target(
|
||||
name = "user_job",
|
||||
dependencies = [
|
||||
"timelines/data_processing/ml_util/aggregation_framework/job",
|
||||
],
|
||||
)
|
||||
|
||||
target(
|
||||
name = "scalding",
|
||||
dependencies = [
|
||||
"timelines/data_processing/ml_util/aggregation_framework/scalding",
|
||||
],
|
||||
)
|
||||
|
||||
target(
|
||||
name = "conversion",
|
||||
dependencies = [
|
||||
"timelines/data_processing/ml_util/aggregation_framework/conversion",
|
||||
],
|
||||
)
|
||||
|
||||
target(
|
||||
name = "query",
|
||||
dependencies = [
|
||||
"timelines/data_processing/ml_util/aggregation_framework/query",
|
||||
],
|
||||
)
|
||||
|
||||
target(
|
||||
name = "heron",
|
||||
dependencies = [
|
||||
"timelines/data_processing/ml_util/aggregation_framework/heron",
|
||||
],
|
||||
)
|
||||
|
||||
target(
|
||||
dependencies = [
|
||||
":common_offline_stores",
|
||||
":common_online_stores",
|
||||
":common_types",
|
||||
":conversion",
|
||||
":heron",
|
||||
":query",
|
||||
":scalding",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,92 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
|
||||
|
||||
import com.twitter.algebird.Monoid
|
||||
import com.twitter.ml.api._
|
||||
import com.twitter.ml.api.constant.SharedFeatures
|
||||
import com.twitter.ml.api.util.SRichDataRecord
|
||||
import scala.collection.mutable
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon._
|
||||
|
||||
/**
|
||||
* Monoid to aggregate over DataRecord objects.
|
||||
*
|
||||
* @param aggregates Set of ''TypedAggregateGroup'' case classes*
|
||||
* to compute using this monoid (see TypedAggregateGroup.scala)
|
||||
*/
|
||||
trait DataRecordMonoid extends Monoid[DataRecord] {

  val aggregates: Set[TypedAggregateGroup[_]]

  def zero(): DataRecord = new DataRecord

  /**
   * Adds two datarecords using this monoid.
   *
   * Every aggregate group writes its combined values into a fresh record, which
   * is then stamped with the larger of the two input timestamps.
   *
   * @param left Left datarecord to add
   * @param right Right datarecord to add
   * @return Sum of the two datarecords as a new DataRecord
   */
  def plus(left: DataRecord, right: DataRecord): DataRecord = {
    val sum = zero()
    for (group <- aggregates) group.mutatePlus(sum, left, right)

    val newestTimestamp = getTimestamp(left).max(getTimestamp(right))
    SRichDataRecord(sum).setFeatureValue(SharedFeatures.TIMESTAMP, newestTimestamp)
    sum
  }
}
|
||||
|
||||
/**
 * [[DataRecordMonoid]] whose `sumOption` folds records in buffered batches,
 * accumulating into a single mutable record per batch.
 */
case class DataRecordAggregationMonoid(aggregates: Set[TypedAggregateGroup[_]])
    extends DataRecordMonoid {

  /**
   * Collapses `buffer` in place: afterwards it holds exactly one record, the
   * sum of everything it contained before.
   */
  private def sumBuffer(buffer: mutable.ArrayBuffer[DataRecord]): Unit = {
    val accumulated = zero()
    for (record <- buffer) {
      // Timestamps are read before mutatePlus rewrites `accumulated`.
      val accumulatedTimestamp = getTimestamp(accumulated)
      val recordTimestamp = getTimestamp(record)
      aggregates.foreach(_.mutatePlus(accumulated, accumulated, record))
      SRichDataRecord(accumulated).setFeatureValue(
        SharedFeatures.TIMESTAMP,
        accumulatedTimestamp.max(recordTimestamp)
      )
    }

    buffer.clear()
    buffer += accumulated
  }

  /**
   * Efficient batched aggregation of datarecords using
   * this monoid + a buffer, for performance.
   *
   * A single input record is returned as-is, without being summed.
   *
   * @param dataRecordIter An iterator of datarecords to sum
   * @return A datarecord option containing the sum; None for empty input
   */
  override def sumOption(dataRecordIter: TraversableOnce[DataRecord]): Option[DataRecord] =
    if (dataRecordIter.isEmpty) {
      None
    } else {
      val BatchSize = 1000
      val pending = mutable.ArrayBuffer[DataRecord]()

      dataRecordIter.foreach { record =>
        // Collapse once the buffer has grown past the batch size, then keep appending.
        if (pending.size > BatchSize) sumBuffer(pending)
        pending += record
      }

      if (pending.size > 1) sumBuffer(pending)
      // Non-empty input guarantees at least one buffered record here.
      pending.headOption
    }
}
|
||||
|
||||
/**
|
||||
* This class is used when there is no need to use sumBuffer functionality, as in the case of
|
||||
* online aggregation of datarecords where using a buffer on a small number of datarecords
|
||||
* would add some performance overhead. It adds nothing to [[DataRecordMonoid]]: the body is empty.
|
||||
*/
|
||||
case class DataRecordAggregationMonoidNoBuffer(aggregates: Set[TypedAggregateGroup[_]])
|
||||
extends DataRecordMonoid {}
|
|
@ -0,0 +1,27 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
|
||||
|
||||
import com.twitter.ml.api.DataRecord
|
||||
|
||||
/**
|
||||
* Keyed record that is used to represent the aggregation type and its corresponding data record.
|
||||
*
|
||||
* @constructor creates a new keyed record.
|
||||
*
|
||||
* @param aggregateType the aggregate type
|
||||
* @param record the data record associated with the key
|
||||
**/
|
||||
case class KeyedRecord(aggregateType: AggregateType.Value, record: DataRecord)
|
||||
|
||||
/**
|
||||
* Keyed record map holding multiple data records for one aggregate type.
|
||||
*
|
||||
* @constructor creates a new keyed record map.
|
||||
*
|
||||
* @param aggregateType the aggregate type
|
||||
* @param recordMap a map with key of type Long and value of type DataRecord
|
||||
* where the key indicates the index and the value is the record
|
||||
*
|
||||
**/
|
||||
case class KeyedRecordMap(
|
||||
aggregateType: AggregateType.Value,
|
||||
recordMap: scala.collection.Map[Long, DataRecord])
|
|
@ -0,0 +1,46 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
|
||||
|
||||
import com.twitter.dal.personal_data.thriftscala.PersonalDataType
|
||||
import com.twitter.ml.api.DataRecord
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Batched
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.JavaCompactThrift
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.genericInjection
|
||||
import com.twitter.summingbird.batch.BatchID
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Key/value injections used to store offline aggregates, keyed by
 * [[AggregationKey]] with (BatchID, DataRecord) values.
 */
object OfflineAggregateInjections {

  /** Injection without any personal-data-type annotations. */
  val offlineDataRecordAggregateInjection: KeyValInjection[AggregationKey, (BatchID, DataRecord)] =
    KeyValInjection(
      genericInjection(AggregationKeyInjection),
      Batched(JavaCompactThrift[DataRecord])
    )

  /**
   * Collects the personal data types attached to the features of the given groups.
   *
   * Java PDT enum values are translated to their Scala thrift counterparts by
   * numeric value; values with no Scala counterpart are dropped.
   *
   * @param aggregateGroups groups to inspect
   * @param featureExtractor selects which features of a group to look at
   * @return the collected PDTs, or None when there are none
   */
  private[aggregation_framework] def getPdts[T](
    aggregateGroups: Iterable[T],
    featureExtractor: T => Iterable[Feature[_]]
  ): Option[Set[PersonalDataType]] = {
    val collected: Set[PersonalDataType] =
      for {
        group <- aggregateGroups.toSet[T]
        feature <- featureExtractor(group)
        pdtSet <- feature.getPersonalDataTypes.asSet().asScala
        javaPdt <- pdtSet.asScala
        scalaPdt <- PersonalDataType.get(javaPdt.getValue)
      } yield scalaPdt

    Some(collected).filter(_.nonEmpty)
  }

  /**
   * Builds the injection for the given groups, annotating the key side with the
   * PDTs of all output keys and the value side with the PDTs of all output features.
   */
  def getInjection(
    aggregateGroups: Set[TypedAggregateGroup[_]]
  ): KeyValInjection[AggregationKey, (BatchID, DataRecord)] = {
    val pdtsOfKeys = getPdts[TypedAggregateGroup[_]](aggregateGroups, _.allOutputKeys)
    val pdtsOfValues = getPdts[TypedAggregateGroup[_]](aggregateGroups, _.allOutputFeatures)

    KeyValInjection(
      genericInjection(AggregationKeyInjection, pdtsOfKeys),
      genericInjection(Batched(JavaCompactThrift[DataRecord]), pdtsOfValues)
    )
  }
}
|
|
@ -0,0 +1,21 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
|
||||
|
||||
import com.twitter.dal.client.dataset.TimePartitionedDALDataset
|
||||
import com.twitter.ml.api.DataRecord
|
||||
import com.twitter.ml.api.Feature
|
||||
import java.lang.{Long => JLong}
|
||||
|
||||
// Offline AggregateSource that can point at either an HDFS path or a DAL dataset.
case class OfflineAggregateSource(
|
||||
override val name: String,
|
||||
override val timestampFeature: Feature[JLong],
|
||||
scaldingHdfsPath: Option[String] = None,
|
||||
scaldingSuffixType: Option[String] = None,
|
||||
dalDataSet: Option[TimePartitionedDALDataset[DataRecord]] = None,
|
||||
withValidation: Boolean = true) // context: https://jira.twitter.biz/browse/TQ-10618
|
||||
extends AggregateSource {
|
||||
/*
|
||||
* To help transition callers to use DAL.read, we check that the HDFS path and
|
||||
* the dalDataSet are not both defined. Leaving both undefined is not rejected here.
|
||||
*/
|
||||
assert(!(scaldingHdfsPath.isDefined && dalDataSet.isDefined))
|
||||
}
|
|
@ -0,0 +1,128 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
|
||||
|
||||
import com.twitter.dal.client.dataset.KeyValDALDataset
|
||||
import com.twitter.ml.api.DataRecord
|
||||
import com.twitter.scalding.DateParser
|
||||
import com.twitter.scalding.RichDate
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
|
||||
import com.twitter.storehaus_internal.manhattan._
|
||||
import com.twitter.storehaus_internal.util.ApplicationID
|
||||
import com.twitter.storehaus_internal.util.DatasetName
|
||||
import com.twitter.storehaus_internal.util.HDFSPath
|
||||
import com.twitter.summingbird.batch.BatchID
|
||||
import com.twitter.summingbird.batch.Batcher
|
||||
import com.twitter.summingbird_internal.runner.store_config._
|
||||
import java.util.TimeZone
|
||||
import com.twitter.summingbird.batch.MillisecondBatcher
|
||||
|
||||
/**
 * Settings shared by every offline aggregate store.
 *
 * @param outputHdfsPathPrefix HDFS prefix under which all offline aggregate outputs are stored
 * @param dummyAppId           placeholder Manhattan app id required by summingbird (unused)
 * @param dummyDatasetPrefix   placeholder Manhattan dataset prefix required by summingbird (unused)
 * @param startDate            date from which the summingbird job starts computing aggregates
 */
case class OfflineAggregateStoreCommonConfig(
  outputHdfsPathPrefix: String,
  dummyAppId: String,
  dummyDatasetPrefix: String,
  startDate: String)
|
||||
|
||||
/**
 * Implemented by any object that supplies the HDFS prefix to which output data is written.
 * For example, timelines has its own output prefix for aggregates_v2 results; other teams
 * can define their own.
 */
trait OfflineStoreCommonConfig extends Serializable {

  /**
   * Creates the common configuration for a given start date.
   *
   * @param startDate date to create the config for
   * @return an [[OfflineAggregateStoreCommonConfig]] with all output details populated
   */
  def apply(startDate: String): OfflineAggregateStoreCommonConfig
}
|
||||
|
||||
/**
 * Base class of offline aggregate stores. Concrete stores provide the abstract members;
 * everything else is derived from them at construction time.
 *
 * Abstract members:
 *  - `name`: uniquely identifiable, human-readable name of this output store
 *  - `startDate`: start date from which aggregates should be computed
 *  - `commonConfig`: provider of the other common configuration details
 *  - `batchesToKeep`: retention policy on output (number of batches to keep)
 *  - `maxKvSourceFailures`: NOTE(review): not described in this file; presumably the number
 *    of tolerated key-value source failures. Confirm against the consumer.
 */
abstract class OfflineAggregateStoreBase
    extends OfflineStoreOnlyConfig[ManhattanROConfig]
    with AggregateStore {

  override def name: String
  def startDate: String
  def commonConfig: OfflineStoreCommonConfig
  def batchesToKeep: Int
  def maxKvSourceFailures: Int

  /** Common configuration resolved for this store's start date. */
  val datedCommonConfig: OfflineAggregateStoreCommonConfig = commonConfig(startDate)

  /** Read-only Manhattan configuration derived from the common config and the store name. */
  val manhattan: ManhattanROConfig = {
    /* This is a sample config, will be replaced with production config later */
    val outputPath = HDFSPath(s"${datedCommonConfig.outputHdfsPathPrefix}/$name")
    val appId = ApplicationID(datedCommonConfig.dummyAppId)
    val dataset = DatasetName(s"${datedCommonConfig.dummyDatasetPrefix}_${name}_1")
    ManhattanROConfig(
      outputPath,
      appId,
      dataset,
      com.twitter.storehaus_internal.manhattan.Adama
    )
  }

  /** Batch length in hours. */
  val batcherSize = 24

  /** Batcher producing batches of `batcherSize` hours. */
  val batcher: MillisecondBatcher = Batcher.ofHours(batcherSize)

  /** Start date of the dated common config, parsed in UTC with the default date parser. */
  val startTime: RichDate =
    RichDate(datedCommonConfig.startDate)(TimeZone.getTimeZone("UTC"), DateParser.default)

  /** The offline configuration is the Manhattan configuration above. */
  val offline: ManhattanROConfig = manhattan
}
|
||||
|
||||
/**
 * An aggregate store whose contents are DataRecords.
 *
 * @param name                uniquely identifiable, human-readable name of this output store
 * @param startDate           start date from which aggregates should be computed
 * @param commonConfig        provider of the other common configuration details
 * @param batchesToKeep       retention policy on output (number of batches to keep)
 * @param maxKvSourceFailures NOTE(review): not described in this file; confirm with the consumer
 */
case class OfflineAggregateDataRecordStore(
  override val name: String,
  override val startDate: String,
  override val commonConfig: OfflineStoreCommonConfig,
  override val batchesToKeep: Int = 7,
  override val maxKvSourceFailures: Int = 0)
    extends OfflineAggregateStoreBase {

  /**
   * Creates the DAL-backed counterpart of this store, copying name, start date, common
   * config and `maxKvSourceFailures`. `batchesToKeep` is not forwarded, so the DAL store
   * uses its own default for it.
   *
   * @param dalDataset the key-value DAL dataset the resulting store is bound to
   */
  def toOfflineAggregateDataRecordStoreWithDAL(
    dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]]
  ): OfflineAggregateDataRecordStoreWithDAL =
    OfflineAggregateDataRecordStoreWithDAL(
      name,
      startDate,
      commonConfig,
      dalDataset,
      maxKvSourceFailures = maxKvSourceFailures
    )
}
|
||||
|
||||
/** Mixed into stores that expose the key-value DAL dataset they are bound to. */
trait withDALDataset {

  /** DAL dataset of (AggregationKey, (BatchID, DataRecord)) key-value pairs. */
  def dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]]
}
|
||||
|
||||
/**
 * An aggregate store whose contents are DataRecords and which writes using DAL.
 *
 * @param name                uniquely identifiable, human-readable name of this output store
 * @param startDate           start date from which aggregates should be computed
 * @param commonConfig        provider of the other common configuration details
 * @param dalDataset          the KeyValDALDataset for this output store
 * @param batchesToKeep       unused, kept for interface compatibility. A separate Oxpecker
 *                            retention policy must be defined to keep the desired number of
 *                            versions.
 * @param maxKvSourceFailures NOTE(review): not described in this file; confirm with the consumer
 */
case class OfflineAggregateDataRecordStoreWithDAL(
  override val name: String,
  override val startDate: String,
  override val commonConfig: OfflineStoreCommonConfig,
  override val dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]],
  override val batchesToKeep: Int = -1,
  override val maxKvSourceFailures: Int = 0)
    extends OfflineAggregateStoreBase
    with withDALDataset
|
|
@ -0,0 +1,39 @@
|
|||
Overview
|
||||
========
|
||||
|
||||
|
||||
The **aggregation framework** is a set of libraries and utilities that allows teams to flexibly
|
||||
compute aggregate (counting) features in both batch and in real-time. Aggregate features can capture
|
||||
historical interactions between arbitrary entities (and sets thereof), conditional on provided features
|
||||
and labels.
|
||||
|
||||
These types of engineered aggregate features have proven to be highly impactful across different teams at Twitter.
|
||||
|
||||
|
||||
What are some features we can compute?
|
||||
--------------------------------------
|
||||
|
||||
The framework supports computing aggregate features on provided grouping keys. The only constraint is that these keys are sparse binary features (or are sets thereof).
|
||||
|
||||
For example, a common use case is to calculate a user's past engagement history with various types of tweets (photo, video, retweets, etc.), specific authors, specific in-network engagers or any other entity the user has interacted with and that could provide signal. In this case, the underlying aggregation keys are `userId`, `(userId, authorId)` or `(userId, engagerId)`.
|
||||
|
||||
In Timelines and MagicRecs, we also compute custom aggregate engagement counts on every `tweetId`. Similarly, other aggregations are possible, perhaps on `advertiserId` or `mediaId`, as long as the grouping key is sparse binary.
|
||||
|
||||
|
||||
What implementations are supported?
|
||||
-----------------------------------
|
||||
|
||||
Offline, we support the daily batch processing of DataRecords containing all required input features to generate
|
||||
aggregate features. These are then uploaded to Manhattan for online hydration.
|
||||
|
||||
Online, we support the real-time aggregation of DataRecords through Storm with a backing memcache that can be queried
|
||||
for the real-time aggregate features.
|
||||
|
||||
Additional documentation exists in the [docs folder](docs)
|
||||
|
||||
|
||||
Where is this used?
|
||||
--------------------
|
||||
|
||||
The Home Timeline heavy ranker uses a variety of both [batch and real time features](../../../../src/scala/com/twitter/timelines/prediction/common/aggregates/README.md) generated by this framework.
|
||||
These features are also used for email and other recommendations.
|
|
@ -0,0 +1,68 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
|
||||
|
||||
import com.twitter.ml.api.constant.SharedFeatures
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.ml.api.FeatureType
|
||||
|
||||
/**
 * Convenience class to describe the stores that make up a particular type of aggregate.
 *
 * For example, as of 2018/07, user aggregates are generated by merging the individual
 * "user_aggregates", "rectweet_user_aggregates", and "twitter_wide_user_aggregates".
 *
 * Construction fails with an IllegalArgumentException when `storeNames` is empty or when
 * the stores do not share the same aggregation keys (see `StoreMerger.isValidToMerge`).
 *
 * @param storeNames Names of the stores; must be non-empty.
 * @param aggregateType Type of aggregate, usually differentiated by the aggregation key.
 * @param shouldHash Used at TimelineRankingAggregatesUtil.extractSecondary when extracting the
 *                   secondary key value.
 * @param storeMerger Used to validate the stores and to look up their aggregation keys.
 */
case class StoreConfig[T](
  storeNames: Set[String],
  aggregateType: AggregateType.Value,
  shouldHash: Boolean = false
)(
  implicit storeMerger: StoreMerger) {
  // An empty set passes isValidToMerge vacuously; reject it here with a clear message
  // instead of letting `storeNames.head` below throw a bare NoSuchElementException.
  require(storeNames.nonEmpty, "StoreConfig requires at least one store name.")
  require(storeMerger.isValidToMerge(storeNames))

  // All stores share the same aggregation keys (checked above), so any one represents them.
  private val representativeStore = storeNames.head

  val aggregationKeyIds: Set[Long] = storeMerger.getAggregateKeys(representativeStore)
  val aggregationKeyFeatures: Set[Feature[_]] =
    storeMerger.getAggregateKeyFeatures(representativeStore)
  val secondaryKeyFeatureOpt: Option[Feature[_]] = storeMerger.getSecondaryKey(representativeStore)
}
|
||||
|
||||
/**
 * Looks up aggregation-key information per output store from an [[AggregationConfig]] and
 * decides whether a set of stores can be merged.
 */
trait StoreMerger {
  def aggregationConfig: AggregationConfig

  /** Key features of every aggregate group that writes to the store named `storeName`. */
  def getAggregateKeyFeatures(storeName: String): Set[Feature[_]] =
    aggregationConfig.aggregatesToCompute
      .filter(group => group.outputStore.name == storeName)
      .flatMap(group => group.keysToAggregate)

  /** Dense feature ids (as computed by TypedAggregateGroup) of the store's key features. */
  def getAggregateKeys(storeName: String): Set[Long] = {
    val keyFeatures = getAggregateKeyFeatures(storeName)
    TypedAggregateGroup.getKeyFeatureIds(keyFeatures)
  }

  /**
   * Returns the aggregation key of the store other than USER_ID, if there is one. A sparse
   * binary key is returned in its transformed form (`TypedAggregateGroup.sparseFeature`).
   *
   * Fails with an IllegalArgumentException when the store has more than two keys or when
   * USER_ID is not among them.
   */
  def getSecondaryKey(storeName: String): Option[Feature[_]] = {
    val keys = getAggregateKeyFeatures(storeName)
    require(keys.size <= 2, "Only singleton or binary aggregation keys are supported.")
    require(keys.contains(SharedFeatures.USER_ID), "USER_ID must be one of the aggregation keys.")

    val secondaryKey = keys.find(key => key != SharedFeatures.USER_ID)
    secondaryKey.map { key =>
      if (key.getFeatureType == FeatureType.SPARSE_BINARY) {
        TypedAggregateGroup.sparseFeature(key)
      } else {
        key
      }
    }
  }

  /**
   * Stores may only be merged if they have the same aggregation key.
   * An empty set of store names is considered valid.
   */
  def isValidToMerge(storeNames: Set[String]): Boolean =
    storeNames.headOption.map(getAggregateKeys) match {
      case Some(expectedKeys) =>
        storeNames.forall(storeName => getAggregateKeys(storeName) == expectedKeys)
      case None => true
    }
}
|
|
@ -0,0 +1,13 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
|
||||
|
||||
/** Registry of store configurations with lookups by aggregate type and by store name. */
trait StoreRegister {

  /** Every store configuration known to this register. */
  def allStores: Set[StoreConfig[_]]

  /**
   * Store configuration per aggregate type. If several configurations share an aggregate
   * type, only one of them is retained by the map.
   */
  lazy val storeMap: Map[AggregateType.Value, StoreConfig[_]] =
    allStores.map(config => (config.aggregateType, config)).toMap

  /** Aggregate type of each individual store name found in `allStores`. */
  lazy val storeNameToTypeMap: Map[String, AggregateType.Value] = {
    val nameTypePairs = for {
      config <- allStores
      storeName <- config.storeNames
    } yield (storeName, config.aggregateType)
    nameTypePairs.toMap
  }
}
|
|
@ -0,0 +1,486 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
|
||||
|
||||
import com.twitter.ml.api._
|
||||
import com.twitter.ml.api.constant.SharedFeatures
|
||||
import com.twitter.ml.api.util.SRichDataRecord
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregateFeature
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetric
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon._
|
||||
import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform
|
||||
import com.twitter.util.Duration
|
||||
import com.twitter.util.Try
|
||||
import java.lang.{Boolean => JBoolean}
|
||||
import java.lang.{Double => JDouble}
|
||||
import java.lang.{Long => JLong}
|
||||
import java.util.{Set => JSet}
|
||||
import scala.annotation.tailrec
|
||||
import scala.language.existentials
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.util.matching.Regex
|
||||
|
||||
/**
 * A case class containing precomputed data that makes operations over a single
 * aggregate fast to process.
 *
 * @param query            The underlying feature being aggregated
 * @param metric           The aggregation metric
 * @param outputFeatures   The output features that the aggregation will produce
 * @param outputFeatureIds The precomputed hashes of the above outputFeatures
 */
case class PrecomputedAggregateDescriptor[T](
  query: AggregateFeature[T],
  metric: AggregationMetric[T, _],
  outputFeatures: List[Feature[_]],
  outputFeatureIds: List[JLong])
|
||||
|
||||
object TypedAggregateGroup {

  /**
   * Generates all combinations of value assignments for a collection of sparse binary
   * features. An empty input list yields an empty set; an id with no possible values
   * also yields an empty set, since no complete assignment exists.
   *
   * @param sparseBinaryIdValues list of sparse binary feature ids and possible values they can take
   * @return A set of maps, where each map represents one possible assignment of values to ids
   */
  def sparseBinaryPermutations(
    sparseBinaryIdValues: List[(Long, Set[String])]
  ): Set[Map[Long, String]] = sparseBinaryIdValues match {
    case Nil => Set.empty
    case (firstId, firstValues) :: remaining =>
      tailRecSparseBinaryPermutations(
        existingPermutations = firstValues.map(value => Map(firstId -> value)),
        remainingIdValues = remaining
      )
  }

  /* Extends every partial assignment with each value of the next id until no ids remain. */
  @tailrec private[this] def tailRecSparseBinaryPermutations(
    existingPermutations: Set[Map[Long, String]],
    remainingIdValues: List[(Long, Set[String])]
  ): Set[Map[Long, String]] = remainingIdValues match {
    case (id, values) :: rest =>
      val extended = for {
        partialAssignment <- existingPermutations
        value <- values
      } yield partialAssignment + (id -> value)
      tailRecSparseBinaryPermutations(extended, rest)
    case Nil => existingPermutations
  }

  val SparseFeatureSuffix = ".member"

  /**
   * Text feature standing for a single member of the given sparse binary feature. Its name
   * is the original dense feature name followed by [[SparseFeatureSuffix]].
   */
  def sparseFeature(sparseBinaryFeature: Feature[_]): Feature[String] = {
    val memberFeatureName = sparseBinaryFeature.getDenseFeatureName + SparseFeatureSuffix
    new Feature.Text(
      memberFeatureName,
      AggregationMetricCommon.derivePersonalDataTypes(Some(sparseBinaryFeature)))
  }

  /*
   * Intended to throw if obj is not an instance of U.
   * NOTE(review): U is an unconstrained type parameter, so under JVM type erasure the
   * isInstanceOf check is unchecked and cannot actually reject a value. Confirm whether a
   * real runtime check is wanted.
   */
  private[this] def validate[U](obj: Any): U = {
    require(obj.isInstanceOf[U])
    obj.asInstanceOf[U]
  }

  /* Value of the feature in the record, or None when the lookup returns null. */
  private[this] def getFeatureOpt[U](dataRecord: DataRecord, feature: Feature[U]): Option[U] =
    Option(SRichDataRecord(dataRecord).getFeatureValue(feature)).map(validate[U](_))

  /**
   * Get a mapping from feature ids
   * (including individual sparse elements of a sparse feature) to values
   * from the given data record, for a given feature type.
   *
   * @param dataRecord Data record to get features from
   * @param keysToAggregate key features to get id-value mappings for
   * @param featureType Feature type to get id-value maps for
   * @return one (id, optional value) pair per key feature of the requested type
   */
  def getKeyFeatureIdValues[U](
    dataRecord: DataRecord,
    keysToAggregate: Set[Feature[_]],
    featureType: FeatureType
  ): Set[(Long, Option[U])] =
    keysToAggregate
      .filter(key => key.getFeatureType == featureType)
      .map { key =>
        val typedFeature = validate[Feature[U]](key)
        (getDenseFeatureId(typedFeature), getFeatureOpt(dataRecord, typedFeature))
      }

  // TypedAggregateGroup may transform the aggregate keys for internal use. This method generates
  // denseFeatureIds for the transformed feature.
  def getDenseFeatureId(feature: Feature[_]): Long =
    if (feature.getFeatureType == FeatureType.SPARSE_BINARY) {
      sparseFeature(feature).getDenseFeatureId
    } else {
      feature.getDenseFeatureId
    }

  /**
   * Return denseFeatureIds for the input features after applying the custom transformation that
   * TypedAggregateGroup applies to its keysToAggregate.
   *
   * @param keysToAggregate key features to get id for
   */
  def getKeyFeatureIds(keysToAggregate: Set[Feature[_]]): Set[Long] =
    keysToAggregate.map(key => getDenseFeatureId(key))

  /** True when every id in the map has a value (vacuously true for an empty map). */
  def checkIfAllKeysExist[U](featureIdValueMap: Map[Long, Option[U]]): Boolean =
    featureIdValueMap.values.forall(_.isDefined)

  /** Drops the ids without a value and unwraps the values of the remaining ones. */
  def liftOptions[U](featureIdValueMap: Map[Long, Option[U]]): Map[Long, U] =
    featureIdValueMap.collect { case (id, Some(value)) => (id, value) }

  val timestampFeature: Feature[JLong] = SharedFeatures.TIMESTAMP

  /**
   * Builds all valid aggregation keys (for the output store) from
   * a datarecord and a spec listing the keys to aggregate. There
   * can be multiple aggregation keys generated from a single data
   * record when grouping by sparse binary features, for which multiple
   * values can be set within the data record.
   *
   * No key is produced when any discrete or text key feature is missing from the record,
   * or when a sparse binary key feature has no values in the record.
   *
   * @param dataRecord Data record to read values for key features from
   * @param keysToAggregate key features to group by
   * @return A set of AggregationKeys encoding the values of all keys
   */
  def buildAggregationKeys(
    dataRecord: DataRecord,
    keysToAggregate: Set[Feature[_]]
  ): Set[AggregationKey] = {
    val discreteValuesById =
      getKeyFeatureIdValues[Long](dataRecord, keysToAggregate, FeatureType.DISCRETE).toMap

    val textValuesById =
      getKeyFeatureIdValues[String](dataRecord, keysToAggregate, FeatureType.STRING).toMap

    // A sparse binary feature absent from the record is treated as having no values.
    val sparseBinaryIdValues =
      getKeyFeatureIdValues[JSet[String]](dataRecord, keysToAggregate, FeatureType.SPARSE_BINARY)
        .map {
          case (id, valuesOpt) =>
            (id, valuesOpt.fold(Set.empty[String])(_.asScala.toSet))
        }
        .toList

    val allDenseKeysPresent =
      checkIfAllKeysExist(discreteValuesById) && checkIfAllKeysExist(textValuesById)

    if (!allDenseKeysPresent) {
      Set.empty[AggregationKey]
    } else {
      val discreteKeys = liftOptions(discreteValuesById)
      val textKeys = liftOptions(textValuesById)
      if (sparseBinaryIdValues.isEmpty) {
        Set(
          AggregationKey(
            discreteFeaturesById = discreteKeys,
            textFeaturesById = textKeys
          )
        )
      } else {
        // One aggregation key per combination of sparse binary member values.
        sparseBinaryPermutations(sparseBinaryIdValues).map { sparseBinaryTextKeys =>
          AggregationKey(
            discreteFeaturesById = discreteKeys,
            textFeaturesById = textKeys ++ sparseBinaryTextKeys
          )
        }
      }
    }
  }

}
|
||||
|
||||
/**
|
||||
* Specifies one or more related aggregate(s) to compute in the summingbird job.
|
||||
*
|
||||
* @param inputSource Source to compute this aggregate over
|
||||
* @param preTransforms Sequence of [[com.twitter.ml.api.RichITransform]] that transform
|
||||
* data records pre-aggregation (e.g. discretization, renaming)
|
||||
* @param samplingTransformOpt Optional [[OneToSomeTransform]] that transform data
|
||||
* record to optional data record (e.g. for sampling) before aggregation
|
||||
* @param aggregatePrefix Prefix to use for naming resultant aggregate features
|
||||
* @param keysToAggregate Features to group by when computing the aggregates
|
||||
* (e.g. USER_ID, AUTHOR_ID)
|
||||
* @param featuresToAggregate Features to aggregate (e.g. blender_score or is_photo)
|
||||
* @param labels Labels to cross the features with to make pair features, if any.
|
||||
* use Label.All if you don't want to cross with a label.
|
||||
* @param metrics Aggregation metrics to compute (e.g. count, mean)
|
||||
* @param halfLives Half lives to use for the aggregations, to be crossed with the above.
|
||||
* use Duration.Top for "forever" aggregations over an infinite time window (no decay).
|
||||
* @param outputStore Store to output this aggregate to
|
||||
* @param includeAnyFeature Aggregate label counts for any feature value
|
||||
* @param includeAnyLabel Aggregate feature counts for any label value (e.g. all impressions)
|
||||
*
|
||||
* The overall config for the summingbird job consists of a list of "AggregateGroup"
|
||||
* case class objects, which get translated into strongly typed "TypedAggregateGroup"
|
||||
* case class objects. A single TypedAggregateGroup always groups input data records from
|
||||
* ''inputSource'' by a single set of aggregation keys (''featuresToAggregate'').
|
||||
* Within these groups, we perform a comprehensive cross of:
|
||||
*
|
||||
* ''featuresToAggregate'' x ''labels'' x ''metrics'' x ''halfLives''
|
||||
*
|
||||
* All the resultant aggregate features are assigned a human-readable feature name
|
||||
* beginning with ''aggregatePrefix'', and are written to DataRecords that get
|
||||
* aggregated and written to the store specified by ''outputStore''.
|
||||
*
|
||||
* Illustrative example. Suppose we define our spec as follows:
|
||||
*
|
||||
* TypedAggregateGroup(
|
||||
* inputSource = "timelines_recap_daily",
|
||||
* aggregatePrefix = "user_author_aggregate",
|
||||
* keysToAggregate = Set(USER_ID, AUTHOR_ID),
|
||||
* featuresToAggregate = Set(RecapFeatures.TEXT_SCORE, RecapFeatures.BLENDER_SCORE),
|
||||
* labels = Set(RecapFeatures.IS_FAVORITED, RecapFeatures.IS_REPLIED),
|
||||
* metrics = Set(CountMetric, MeanMetric),
|
||||
* halfLives = Set(7.Days, 30.Days),
|
||||
* outputStore = "user_author_aggregate_store"
|
||||
* )
|
||||
*
|
||||
* This will process data records from the source named "timelines_recap_daily"
|
||||
* (see AggregateSource.scala for more details on how to add your own source)
|
||||
* It will produce a total of 2x2x2x2 = 16 aggregation features, named like:
|
||||
*
|
||||
* user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.count.7days
|
||||
* user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.count.30days
|
||||
* user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.mean.7days
|
||||
*
|
||||
* ... (and so on)
|
||||
*
|
||||
* and all the result features will be stored in DataRecords, summed up, and written
|
||||
* to the output store defined by the name "user_author_aggregate_store".
|
||||
* (see AggregateStore.scala for details on how to add your own store).
|
||||
*
|
||||
* If you do not want a full cross, split up your config into multiple TypedAggregateGroup
|
||||
* objects. Splitting is strongly advised to avoid blowing up and creating invalid
|
||||
* or unnecessary combinations of aggregate features (note that some combinations
|
||||
* are useless or invalid e.g. computing the mean of a binary feature). Splitting
|
||||
* also does not cost anything in terms of real-time performance, because all
|
||||
* Aggregate objects in the master spec that share the same ''keysToAggregate'', the
|
||||
* same ''inputSource'' and the same ''outputStore'' are grouped by the summingbird
|
||||
* job logic and stored into a single DataRecord in the output store. Overlapping
|
||||
* aggregates will also automatically be deduplicated so don't worry about overlaps.
|
||||
*/
|
||||
case class TypedAggregateGroup[T](
|
||||
inputSource: AggregateSource,
|
||||
aggregatePrefix: String,
|
||||
keysToAggregate: Set[Feature[_]],
|
||||
featuresToAggregate: Set[Feature[T]],
|
||||
labels: Set[_ <: Feature[JBoolean]],
|
||||
metrics: Set[AggregationMetric[T, _]],
|
||||
halfLives: Set[Duration],
|
||||
outputStore: AggregateStore,
|
||||
preTransforms: Seq[OneToSomeTransform] = Seq.empty,
|
||||
includeAnyFeature: Boolean = true,
|
||||
includeAnyLabel: Boolean = true,
|
||||
aggExclusionRegex: Seq[String] = Seq.empty) {
|
||||
import TypedAggregateGroup._
|
||||
|
||||
val compiledRegexes = aggExclusionRegex.map(new Regex(_))
|
||||
|
||||
// true if should drop, false if should keep
|
||||
def filterOutAggregateFeature(
|
||||
feature: PrecomputedAggregateDescriptor[_],
|
||||
regexes: Seq[Regex]
|
||||
): Boolean = {
|
||||
if (regexes.nonEmpty)
|
||||
feature.outputFeatures.exists { feature =>
|
||||
regexes.exists { re => re.findFirstMatchIn(feature.getDenseFeatureName).nonEmpty }
|
||||
}
|
||||
else false
|
||||
}
|
||||
|
||||
def buildAggregationKeys(
|
||||
dataRecord: DataRecord
|
||||
): Set[AggregationKey] = {
|
||||
TypedAggregateGroup.buildAggregationKeys(dataRecord, keysToAggregate)
|
||||
}
|
||||
|
||||
/**
|
||||
* This val precomputes descriptors for all individual aggregates in this group
|
||||
* (of type ''AggregateFeature''). Also precompute hashes of all aggregation
|
||||
* "output" features generated by these operators for faster
|
||||
* run-time performance (this turns out to be a primary CPU bottleneck).
|
||||
* Ex: for the mean operator, "sum" and "count" are output features
|
||||
*/
|
||||
val individualAggregateDescriptors: Set[PrecomputedAggregateDescriptor[T]] = {
|
||||
/*
|
||||
* By default, in additional to all feature-label crosses, also
|
||||
* compute in aggregates over each feature and label without crossing
|
||||
*/
|
||||
val labelOptions = labels.map(Option(_)) ++
|
||||
(if (includeAnyLabel) Set(None) else Set.empty)
|
||||
val featureOptions = featuresToAggregate.map(Option(_)) ++
|
||||
(if (includeAnyFeature) Set(None) else Set.empty)
|
||||
for {
|
||||
feature <- featureOptions
|
||||
label <- labelOptions
|
||||
metric <- metrics
|
||||
halfLife <- halfLives
|
||||
} yield {
|
||||
val query = AggregateFeature[T](aggregatePrefix, feature, label, halfLife)
|
||||
|
||||
val aggregateOutputFeatures = metric.getOutputFeatures(query)
|
||||
val aggregateOutputFeatureIds = metric.getOutputFeatureIds(query)
|
||||
PrecomputedAggregateDescriptor(
|
||||
query,
|
||||
metric,
|
||||
aggregateOutputFeatures,
|
||||
aggregateOutputFeatureIds
|
||||
)
|
||||
}
|
||||
}.filterNot(filterOutAggregateFeature(_, compiledRegexes))
|
||||
|
||||
/* Precomputes a map from all generated aggregate feature ids to their half lives. */
|
||||
val continuousFeatureIdsToHalfLives: Map[Long, Duration] =
|
||||
individualAggregateDescriptors.flatMap { descriptor =>
|
||||
descriptor.outputFeatures
|
||||
.flatMap { feature =>
|
||||
if (feature.getFeatureType() == FeatureType.CONTINUOUS) {
|
||||
Try(feature.asInstanceOf[Feature[JDouble]]).toOption
|
||||
.map(feature => (feature.getFeatureId(), descriptor.query.halfLife))
|
||||
} else None
|
||||
}
|
||||
}.toMap
|
||||
|
||||
/*
|
||||
* Sparse binary keys become individual string keys in the output.
|
||||
* e.g. group by "words.in.tweet", output key: "words.in.tweet.member"
|
||||
*/
|
||||
val allOutputKeys: Set[Feature[_]] = keysToAggregate.map { key =>
|
||||
if (key.getFeatureType == FeatureType.SPARSE_BINARY) sparseFeature(key)
|
||||
else key
|
||||
}
|
||||
|
||||
val allOutputFeatures: Set[Feature[_]] = individualAggregateDescriptors.flatMap {
|
||||
case PrecomputedAggregateDescriptor(
|
||||
query,
|
||||
metric,
|
||||
outputFeatures,
|
||||
outputFeatureIds
|
||||
) =>
|
||||
outputFeatures
|
||||
}
|
||||
|
||||
val aggregateContext: FeatureContext = new FeatureContext(allOutputFeatures.toList.asJava)
|
||||
|
||||
/**
|
||||
* Adds all aggregates in this group found in the two input data records
|
||||
* into a result, mutating the result. Uses a while loop for an
|
||||
* approximately 10% gain in speed over a for comprehension.
|
||||
*
|
||||
* WARNING: mutates ''result''
|
||||
*
|
||||
* @param result The output data record to mutate
|
||||
* @param left The left data record to add
|
||||
* @param right The right data record to add
|
||||
*/
|
||||
def mutatePlus(result: DataRecord, left: DataRecord, right: DataRecord): Unit = {
|
||||
val featureIterator = individualAggregateDescriptors.iterator
|
||||
while (featureIterator.hasNext) {
|
||||
val descriptor = featureIterator.next
|
||||
descriptor.metric.mutatePlus(
|
||||
result,
|
||||
left,
|
||||
right,
|
||||
descriptor.query,
|
||||
Some(descriptor.outputFeatureIds)
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Apply preTransforms sequentially. If any transform results in a dropped (None)
|
||||
* DataRecord, then entire tranform sequence will result in a dropped DataRecord.
|
||||
* Note that preTransforms are order-dependent.
|
||||
*/
|
||||
private[this] def sequentiallyTransform(dataRecord: DataRecord): Option[DataRecord] = {
|
||||
val recordOpt = Option(new DataRecord(dataRecord))
|
||||
preTransforms.foldLeft(recordOpt) {
|
||||
case (Some(previousRecord), preTransform) =>
|
||||
preTransform(previousRecord)
|
||||
case _ => Option.empty[DataRecord]
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a data record, apply transforms and fetch the incremental contributions to
|
||||
* each configured aggregate from this data record, and store these in an output data record.
|
||||
*
|
||||
* @param dataRecord Input data record to aggregate.
|
||||
* @return A set of tuples (AggregationKey, DataRecord) whose first entry is an
|
||||
* AggregationKey indicating what keys we're grouping by, and whose second entry
|
||||
* is an output data record with incremental contributions to the aggregate value(s)
|
||||
*/
|
||||
def computeAggregateKVPairs(dataRecord: DataRecord): Set[(AggregationKey, DataRecord)] = {
|
||||
sequentiallyTransform(dataRecord)
|
||||
.flatMap { dataRecord =>
|
||||
val aggregationKeys = buildAggregationKeys(dataRecord)
|
||||
val increment = new DataRecord
|
||||
|
||||
val isNonEmptyIncrement = individualAggregateDescriptors
|
||||
.map { descriptor =>
|
||||
descriptor.metric.setIncrement(
|
||||
output = increment,
|
||||
input = dataRecord,
|
||||
query = descriptor.query,
|
||||
timestampFeature = inputSource.timestampFeature,
|
||||
aggregateOutputs = Some(descriptor.outputFeatureIds)
|
||||
)
|
||||
}
|
||||
.exists(identity)
|
||||
|
||||
if (isNonEmptyIncrement) {
|
||||
SRichDataRecord(increment).setFeatureValue(
|
||||
timestampFeature,
|
||||
getTimestamp(dataRecord, inputSource.timestampFeature)
|
||||
)
|
||||
Some(aggregationKeys.map(key => (key, increment)))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
.getOrElse(Set.empty[(AggregationKey, DataRecord)])
|
||||
}
|
||||
|
||||
/**
 * Maps every output feature to a copy of itself whose dense feature name is prefixed
 * with `prefix`. Features for which `isSetFeatureName` is false are mapped to themselves.
 *
 * @param prefix non-empty string prepended to each feature name
 * @return map from original output feature to its renamed counterpart
 */
def outputFeaturesToRenamedOutputFeatures(prefix: String): Map[Feature[_], Feature[_]] = {
  require(prefix.nonEmpty)

  // Builds a feature of the same FeatureType as `original`, under the prefixed name.
  // Feature types not listed in the match below are not handled and raise a MatchError.
  def withPrefix(original: Feature[_]): Feature[_] = {
    val prefixedName = prefix + original.getDenseFeatureName
    // The Feature constructors are handed null when no personal data types are present.
    val personalDataTypes =
      if (original.getPersonalDataTypes.isPresent) original.getPersonalDataTypes.get()
      else null

    original.getFeatureType match {
      case FeatureType.BINARY => new Feature.Binary(prefixedName, personalDataTypes)
      case FeatureType.DISCRETE => new Feature.Discrete(prefixedName, personalDataTypes)
      case FeatureType.STRING => new Feature.Text(prefixedName, personalDataTypes)
      case FeatureType.CONTINUOUS => new Feature.Continuous(prefixedName, personalDataTypes)
      case FeatureType.SPARSE_BINARY =>
        new Feature.SparseBinary(prefixedName, personalDataTypes)
      case FeatureType.SPARSE_CONTINUOUS =>
        new Feature.SparseContinuous(prefixedName, personalDataTypes)
    }
  }

  allOutputFeatures.map { feature =>
    val target = if (feature.isSetFeatureName) withPrefix(feature) else feature
    feature -> target
  }.toMap
}
|
||||
}
|
|
@ -0,0 +1,122 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
|
||||
|
||||
import com.twitter.algebird.ScMapMonoid
|
||||
import com.twitter.algebird.Semigroup
|
||||
import com.twitter.ml.api._
|
||||
import com.twitter.ml.api.constant.SharedFeatures
|
||||
import com.twitter.ml.api.DataRecord
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.ml.api.FeatureType
|
||||
import com.twitter.ml.api.util.SRichDataRecord
|
||||
import java.lang.{Long => JLong}
|
||||
import scala.collection.{Map => ScMap}
|
||||
|
||||
/**
 * Helpers for turning record features into numeric keys and for merging data records,
 * keyed records and keyed record maps.
 */
object Utils {
  val dataRecordMerger: DataRecordMerger = new DataRecordMerger

  /** Creates a fresh, empty DataRecord on every call (this is a def, not a shared instance). */
  def EmptyDataRecord: DataRecord = new DataRecord()

  // Monoid over Long-keyed maps of records. Records stored under the same key are combined
  // by calling dataRecordMerger.merge(x, y) and returning x, i.e. the left operand is
  // reused as the result rather than copied.
  private val keyedDataRecordMapMonoid = {
    val dataRecordMergerSg = new Semigroup[DataRecord] {
      override def plus(x: DataRecord, y: DataRecord): DataRecord = {
        dataRecordMerger.merge(x, y)
        x
      }
    }
    new ScMapMonoid[Long, DataRecord]()(dataRecordMergerSg)
  }

  /** Reads `feature` from `record` and returns it as a primitive Long. */
  def keyFromLong(record: DataRecord, feature: Feature[JLong]): Long =
    SRichDataRecord(record).getFeatureValue(feature).longValue

  /** Parses the string value of `feature` as a Long; yields 0 when it is not a valid number. */
  def keyFromString(record: DataRecord, feature: Feature[String]): Long =
    try {
      SRichDataRecord(record).getFeatureValue(feature).toLong
    } catch {
      case _: NumberFormatException => 0L
    }

  /** Returns the hash code of the string value of `feature`, widened to a Long. */
  def keyFromHash(record: DataRecord, feature: Feature[String]): Long =
    SRichDataRecord(record).getFeatureValue(feature).hashCode.toLong

  /**
   * Extracts a numeric secondary key from `record`.
   *
   * @param record       record to read the key from
   * @param secondaryKey feature holding the key; must be of type STRING or DISCRETE
   * @param shouldHash   for STRING features, hash the value instead of parsing it as a number
   * @throws IllegalArgumentException if the feature type is neither STRING nor DISCRETE
   */
  def extractSecondary[T](
    record: DataRecord,
    secondaryKey: Feature[T],
    shouldHash: Boolean = false
  ): Long = secondaryKey.getFeatureType match {
    case FeatureType.STRING =>
      if (shouldHash) keyFromHash(record, secondaryKey.asInstanceOf[Feature[String]])
      else keyFromString(record, secondaryKey.asInstanceOf[Feature[String]])
    case FeatureType.DISCRETE => keyFromLong(record, secondaryKey.asInstanceOf[Feature[JLong]])
    case f => throw new IllegalArgumentException(s"Feature type $f is not supported.")
  }

  /**
   * Merges all defined keyed records into one. Returns None when no argument is defined.
   * All defined records must share the same aggregateType, otherwise `require` fails.
   * The first defined record is used as the accumulator the others are merged into.
   */
  def mergeKeyedRecordOpts(args: Option[KeyedRecord]*): Option[KeyedRecord] = {
    val keyedRecords = args.flatten
    if (keyedRecords.isEmpty) {
      None
    } else {
      val keys = keyedRecords.map(_.aggregateType)
      require(keys.toSet.size == 1, "All merged records must have the same aggregate key.")
      val mergedRecord = mergeRecords(keyedRecords.map(_.record): _*)
      Some(KeyedRecord(keys.head, mergedRecord))
    }
  }

  private def mergeRecords(args: DataRecord*): DataRecord =
    if (args.isEmpty) EmptyDataRecord
    else {
      // Fold the remaining records into the first one instead of allocating a new
      // accumulator; the first record is therefore the object that is returned.
      args.tail.foldLeft(args.head) { (merged, record) =>
        dataRecordMerger.merge(merged, record)
        merged
      }
    }

  /**
   * Merges two optional keyed record maps. Returns None when both are empty. Both maps
   * (when defined) must share the same aggregateType, otherwise `require` fails.
   *
   * @param maxSize upper bound used to downsample the inputs before merging (see mergeMapOpts)
   */
  def mergeKeyedRecordMapOpts(
    opt1: Option[KeyedRecordMap],
    opt2: Option[KeyedRecordMap],
    maxSize: Int = Int.MaxValue
  ): Option[KeyedRecordMap] = {
    if (opt1.isEmpty && opt2.isEmpty) {
      None
    } else {
      val keys = Seq(opt1, opt2).flatten.map(_.aggregateType)
      require(keys.toSet.size == 1, "All merged records must have the same aggregate key.")
      val mergedRecordMap = mergeMapOpts(opt1.map(_.recordMap), opt2.map(_.recordMap), maxSize)
      Some(KeyedRecordMap(keys.head, mergedRecordMap))
    }
  }

  // Merges the two maps with keyedDataRecordMapMonoid. When the union of their key sets is
  // larger than maxSize, both inputs are first downsampled at rate maxSize / unionSize.
  private def mergeMapOpts(
    opt1: Option[ScMap[Long, DataRecord]],
    opt2: Option[ScMap[Long, DataRecord]],
    maxSize: Int = Int.MaxValue
  ): ScMap[Long, DataRecord] = {
    require(maxSize >= 0)
    val keySet = opt1.map(_.keySet).getOrElse(Set.empty) ++ opt2.map(_.keySet).getOrElse(Set.empty)
    val totalSize = keySet.size
    val rate = if (totalSize <= maxSize) 1.0 else maxSize.toDouble / totalSize
    val prunedOpt1 = opt1.map(downsample(_, rate))
    val prunedOpt2 = opt2.map(downsample(_, rate))
    Seq(prunedOpt1, prunedOpt2).flatten
      .foldLeft(keyedDataRecordMapMonoid.zero)(keyedDataRecordMapMonoid.plus)
  }

  /**
   * Keeps each entry of `m` with probability `samplingRate`, decided deterministically
   * from the entry's key and the sampling rate.
   *
   * @param m            map to downsample
   * @param samplingRate a rate >= 1.0 returns `m` unchanged, a rate <= 0 returns an empty map
   * @return the retained entries
   */
  def downsample[K, T](m: ScMap[K, T], samplingRate: Double): ScMap[K, T] = {
    if (samplingRate >= 1.0) {
      m
    } else if (samplingRate <= 0) {
      Map.empty
    } else {
      m.filter {
        case (key, _) =>
          // It is important that the same key with the same sampling rate be deterministically
          // selected or rejected. Otherwise, mergeMapOpts will choose different keys for the
          // two input maps and their union will be larger than the limit we want.
          //
          // A dedicated generator is created per key instead of reseeding the shared
          // scala.util.Random singleton: with a shared generator another thread could
          // reseed it between our setSeed and nextDouble calls, making the decision
          // non-deterministic, and reseeding would also disturb any other user of the
          // global generator. A generator constructed with a seed yields the same first
          // double as one that was reseeded with it, so the selected keys are unchanged.
          val seed = (key.hashCode, samplingRate.hashCode).hashCode
          new scala.util.Random(seed).nextDouble < samplingRate
      }
    }
  }
}
|
|
@ -0,0 +1,165 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
|
||||
|
||||
import com.twitter.algebird.DecayedValue
|
||||
import com.twitter.algebird.DecayedValueMonoid
|
||||
import com.twitter.algebird.Monoid
|
||||
import com.twitter.ml.api._
|
||||
import com.twitter.ml.api.constant.SharedFeatures
|
||||
import com.twitter.ml.api.util.FDsl._
|
||||
import com.twitter.ml.api.util.SRichDataRecord
|
||||
import com.twitter.summingbird.batch.BatchID
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregateFeature
|
||||
import com.twitter.util.Duration
|
||||
import java.lang.{Double => JDouble}
|
||||
import java.lang.{Long => JLong}
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.collection.mutable
|
||||
import java.{util => ju}
|
||||
|
||||
object AggregatesV2Adapter {
  type AggregatesV2Tuple = (AggregationKey, (BatchID, DataRecord))

  val Epsilon: Double = 1e-6
  val decayedValueMonoid: Monoid[DecayedValue] = DecayedValueMonoid(Epsilon)

  /**
   * Decays the storedValue from timestamp -> sourceVersion.
   *
   * By applying this function, the feature values for all users are decayed
   * to sourceVersion. This is important to ensure that a user whose aggregates
   * were updated long in the past does not have an artificially inflated count
   * compared to one whose aggregates were updated (and hence decayed) more recently.
   *
   * @param storedValue value read from the aggregates v2 output store
   * @param timestamp timestamp corresponding to the stored value
   * @param sourceVersion timestamp of the version to decay all values to uniformly
   * @param halfLife half life duration to use for applying decay
   * @return storedValue unchanged when timestamp is later than sourceVersion,
   *         otherwise the value decayed to sourceVersion
   */
  def decayValueToSourceVersion(
    storedValue: Double,
    timestamp: Long,
    sourceVersion: Long,
    halfLife: Duration
  ): Double = {
    val storedAfterSourceVersion = timestamp > sourceVersion
    if (storedAfterSourceVersion) {
      storedValue
    } else {
      // Adding a zero-valued DecayedValue placed at sourceVersion moves the stored value
      // forward in time to sourceVersion without adding any mass to it.
      val stored = DecayedValue.build(storedValue, timestamp, halfLife.inMilliseconds)
      val zeroAtSourceVersion = DecayedValue.build(0, sourceVersion, halfLife.inMilliseconds)
      decayedValueMonoid.plus(stored, zeroAtSourceVersion).value
    }
  }

  /**
   * Decays all the continuous aggregate features occurring in the ''inputRecord''
   * to a given timestamp, and mutates the ''outputRecord'' accordingly.
   * inputRecord and outputRecord may be the same instance.
   *
   * NOTE(review): a feature whose decayed magnitude does not exceed trimThreshold is
   * simply not written to outputRecord. When outputRecord is the same instance as
   * inputRecord, the previously stored value of such a feature is therefore left in
   * place. Confirm this is the intended trimming behaviour for in-place use.
   *
   * @param inputRecord input record to get features and the timestamp from
   * @param aggregateFeaturesAndHalfLives aggregate features to decay, each paired with the
   *                                      half life to use; non-continuous features are skipped
   * @param decayTo timestamp to decay to
   * @param trimThreshold only decayed values whose absolute value exceeds this are written
   * @param outputRecord output record to mutate
   * @return the mutated outputRecord
   */
  def mutateDecay(
    inputRecord: DataRecord,
    aggregateFeaturesAndHalfLives: List[(Feature[_], Duration)],
    decayTo: Long,
    trimThreshold: Double,
    outputRecord: DataRecord
  ): DataRecord = {
    // Read the timestamp before anything is written, since both records may be the same.
    val timestamp = inputRecord.getFeatureValue(SharedFeatures.TIMESTAMP).toLong

    val continuousFeaturesAndHalfLives = aggregateFeaturesAndHalfLives.collect {
      case (feature: Feature[_], halfLife: Duration)
          if feature.getFeatureType() == FeatureType.CONTINUOUS =>
        (feature.asInstanceOf[Feature[JDouble]], halfLife)
    }

    continuousFeaturesAndHalfLives.foreach {
      case (continuousFeature, halfLife) =>
        if (inputRecord.hasFeature(continuousFeature)) {
          val storedValue = inputRecord.getFeatureValue(continuousFeature).toDouble
          val decayedValue = decayValueToSourceVersion(storedValue, timestamp, decayTo, halfLife)
          if (math.abs(decayedValue) > trimThreshold) {
            outputRecord.setFeatureValue(continuousFeature, decayedValue)
          }
        }
    }

    // Update timestamp to version (now that we've decayed all aggregates).
    outputRecord.setFeatureValue(SharedFeatures.TIMESTAMP, decayTo)

    outputRecord
  }
}
|
||||
|
||||
/**
 * Adapts (AggregationKey, (BatchID, DataRecord)) tuples into data records that carry the
 * key features, the aggregate features decayed to `sourceVersion`, and a timestamp.
 *
 * @param aggregates aggregate groups whose output keys and output features are adapted
 * @param sourceVersion timestamp that all aggregate values are decayed to
 * @param trimThreshold threshold passed on to mutateDecay when writing decayed values
 */
class AggregatesV2Adapter(
  aggregates: Set[TypedAggregateGroup[_]],
  sourceVersion: Long,
  trimThreshold: Double)
    extends IRecordOneToManyAdapter[AggregatesV2Adapter.AggregatesV2Tuple] {

  import AggregatesV2Adapter._

  val keyFeatures: List[Feature[_]] = aggregates.flatMap(_.allOutputKeys).toList
  val aggregateFeatures: List[Feature[_]] = aggregates.flatMap(_.allOutputFeatures).toList
  val timestampFeatures: List[Feature[JLong]] = List(SharedFeatures.TIMESTAMP)
  val allFeatures: List[Feature[_]] = keyFeatures ++ aggregateFeatures ++ timestampFeatures

  val featureContext: FeatureContext = new FeatureContext(allFeatures.asJava)

  override def getFeatureContext: FeatureContext = featureContext

  // Each aggregate feature paired with the half life parsed from that feature.
  val aggregateFeaturesAndHalfLives: List[(Feature[_$3], Duration) forSome { type _$3 }] =
    aggregateFeatures.map { aggregateFeature: Feature[_] =>
      (aggregateFeature, AggregateFeature.parseHalfLife(aggregateFeature))
    }

  /**
   * Produces at most one record per input tuple: an empty list when the stored record
   * has no timestamp feature, otherwise a single record built from the key and the
   * decayed aggregate features.
   */
  override def adaptToDataRecords(tuple: AggregatesV2Tuple): ju.List[DataRecord] = tuple match {
    case (key: AggregationKey, (_: BatchID, record: DataRecord)) =>
      val resultRecord = new SRichDataRecord(new DataRecord, featureContext)

      // Collect first, clear afterwards, so the record is not modified while iterating.
      val featuresToClear = mutable.Set[Feature[JDouble]]()
      val continuousItr = resultRecord.continuousFeaturesIterator()
      while (continuousItr.moveNext()) {
        val candidate = continuousItr.getFeature
        if (!aggregateFeatures.contains(candidate)) {
          featuresToClear += candidate
        }
      }
      featuresToClear.foreach(resultRecord.clearFeature)

      // Copy the key values into the result; only DISCRETE and STRING keys are handled.
      keyFeatures.foreach { keyFeature: Feature[_] =>
        keyFeature.getFeatureType match {
          case FeatureType.DISCRETE =>
            resultRecord.setFeatureValue(
              keyFeature.asInstanceOf[Feature[JLong]],
              key.discreteFeaturesById(keyFeature.getDenseFeatureId)
            )
          case FeatureType.STRING =>
            resultRecord.setFeatureValue(
              keyFeature.asInstanceOf[Feature[String]],
              key.textFeaturesById(keyFeature.getDenseFeatureId)
            )
          case _ => ()
        }
      }

      val adapted: List[DataRecord] =
        if (record.hasFeature(SharedFeatures.TIMESTAMP)) {
          mutateDecay(
            record,
            aggregateFeaturesAndHalfLives,
            sourceVersion,
            trimThreshold,
            resultRecord)
          List(resultRecord.getRecord)
        } else {
          List.empty[DataRecord]
        }
      adapted.asJava
  }
}
|
|
@ -0,0 +1,171 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
|
||||
|
||||
import com.twitter.bijection.Injection
|
||||
import com.twitter.bijection.thrift.CompactThriftCodec
|
||||
import com.twitter.ml.api.AdaptedFeatureSource
|
||||
import com.twitter.ml.api.DataRecord
|
||||
import com.twitter.ml.api.IRecordOneToManyAdapter
|
||||
import com.twitter.ml.api.TypedFeatureSource
|
||||
import com.twitter.scalding.DateRange
|
||||
import com.twitter.scalding.RichDate
|
||||
import com.twitter.scalding.TypedPipe
|
||||
import com.twitter.scalding.commons.source.VersionedKeyValSource
|
||||
import com.twitter.scalding.commons.tap.VersionedTap.TapMode
|
||||
import com.twitter.summingbird.batch.BatchID
|
||||
import com.twitter.summingbird_internal.bijection.BatchPairImplicits
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKeyInjection
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
|
||||
import org.apache.hadoop.mapred.JobConf
|
||||
import scala.collection.JavaConverters._
|
||||
import AggregatesV2Adapter._
|
||||
|
||||
object AggregatesV2AdaptedSource {
  // Default value for the trimThreshold of an aggregates v2 source.
  val DefaultTrimThreshold: Int = 0
}
|
||||
|
||||
/**
 * Aggregates v2 source whose storage format is DataRecord itself: values are
 * (de)serialized with the compact thrift codec and need no further conversion.
 */
trait AggregatesV2AdaptedSource extends AggregatesV2AdaptedSourceBase[DataRecord] {

  // The stored value already is a DataRecord, so it is passed through as is.
  override def toDataRecord(v: DataRecord): DataRecord = v

  override def storageFormatCodec: Injection[DataRecord, Array[Byte]] =
    CompactThriftCodec[DataRecord]
}
|
||||
|
||||
/**
 * Base for feature sources that read the versioned key-value output of an aggregates v2
 * job and expose it as (AggregationKey, (BatchID, DataRecord)) tuples.
 *
 * @tparam StorageFormat type the aggregate values are stored as; converted to a
 *                       DataRecord with toDataRecord
 */
trait AggregatesV2AdaptedSourceBase[StorageFormat]
    extends TypedFeatureSource[AggregatesV2Tuple]
    with AdaptedFeatureSource[AggregatesV2Tuple]
    with BatchPairImplicits {

  /* Output root path of aggregates v2 job, excluding store name and version */
  def rootPath: String

  /* Name of store under root path to read */
  def storeName: String

  // max bijection failures
  def maxFailures: Int = 0

  /* Aggregate config used to generate above output */
  def aggregates: Set[TypedAggregateGroup[_]]

  /* trimThreshold Trim all aggregates below a certain threshold to save memory */
  def trimThreshold: Double

  /* Converts a stored value into a DataRecord */
  def toDataRecord(v: StorageFormat): DataRecord

  /* Version to read; when None, the most recent available version is used */
  def sourceVersionOpt: Option[Long]

  /*
   * When true and sourceVersionOpt is defined, read the most recent available version
   * that is <= the source version instead of the source version itself.
   */
  def enableMostRecentBeforeSourceVersion: Boolean = false

  implicit private val aggregationKeyInjection: Injection[AggregationKey, Array[Byte]] =
    AggregationKeyInjection
  implicit def storageFormatCodec: Injection[StorageFormat, Array[Byte]]

  // Only the aggregate groups that are written to the store being read.
  private def filteredAggregates = aggregates.filter(_.outputStore.name == storeName)
  def storePath: String = List(rootPath, storeName).mkString("/")

  def mostRecentVkvs: VersionedKeyValSource[_, _] = {
    VersionedKeyValSource[AggregationKey, (BatchID, StorageFormat)](
      path = storePath,
      sourceVersion = None,
      maxFailures = maxFailures
    )
  }

  // Lists the versions present in the store. This is a def, so every call queries the
  // store again; callers should bind the result once and reuse it.
  private def availableVersions: Seq[Long] =
    mostRecentVkvs
      .getTap(TapMode.SOURCE)
      .getStore(new JobConf(true))
      .getAllVersions()
      .asScala
      .map(_.toLong)

  private def mostRecentVersion: Long = {
    // List once so the emptiness check and the max are taken over the same listing.
    val versions = availableVersions
    require(versions.nonEmpty, s"$storeName has no available versions")
    versions.max
  }

  /**
   * Resolves the store version to read:
   *  - no source version given: the most recent available version;
   *  - source version given and enableMostRecentBeforeSourceVersion is false: that version;
   *  - source version given and enableMostRecentBeforeSourceVersion is true: the largest
   *    available version that is <= the source version.
   *
   * @throws IllegalArgumentException if no available version is <= the source version
   */
  def versionToUse: Long =
    sourceVersionOpt match {
      case Some(sourceVersion) if enableMostRecentBeforeSourceVersion =>
        // List once; the same listing is used for filtering and for the error message.
        val versions = availableVersions
        val eligibleVersions = versions.filter(_ <= sourceVersion)
        if (eligibleVersions.isEmpty) {
          throw new IllegalArgumentException(
            "No version older than version: %s, available versions: %s"
              .format(sourceVersion, versions)
          )
        } else {
          eligibleVersions.max
        }
      case Some(sourceVersion) => sourceVersion
      case None => mostRecentVersion
    }

  override lazy val adapter: IRecordOneToManyAdapter[AggregatesV2Tuple] =
    new AggregatesV2Adapter(filteredAggregates, versionToUse, trimThreshold)

  override def getData: TypedPipe[AggregatesV2Tuple] = {
    val vkvsToUse: VersionedKeyValSource[AggregationKey, (BatchID, StorageFormat)] = {
      VersionedKeyValSource[AggregationKey, (BatchID, StorageFormat)](
        path = storePath,
        sourceVersion = Some(versionToUse),
        maxFailures = maxFailures
      )
    }
    TypedPipe.from(vkvsToUse).map {
      case (key, (batch, value)) => (key, (batch, toDataRecord(value)))
    }
  }
}
|
||||
|
||||
/**
 * Adapted data record feature source from aggregates v2 manhattan output.
 * Reads the store version derived from the end of the implicit date range.
 * Params are documented in the parent trait.
 */
case class AggregatesV2FeatureSource(
  override val rootPath: String,
  override val storeName: String,
  override val aggregates: Set[TypedAggregateGroup[_]],
  override val trimThreshold: Double = AggregatesV2AdaptedSource.DefaultTrimThreshold,
  override val maxFailures: Int = 0
)(
  implicit val dateRange: DateRange)
    extends AggregatesV2AdaptedSource {

  // Summingbird output for date D is stored at (D+1)T00, so the version to read is
  // one millisecond past the end of the date range.
  override val sourceVersionOpt: Some[Long] = Some(dateRange.end.timestamp + 1L)
}
|
||||
|
||||
/**
 * Reads the most recent available version of the aggregates v2 output.
 * No source version is specified, so there is no constraint on recency.
 * Params are documented in the parent trait.
 */
case class AggregatesV2MostRecentFeatureSource(
  override val rootPath: String,
  override val storeName: String,
  override val aggregates: Set[TypedAggregateGroup[_]],
  override val trimThreshold: Double = AggregatesV2AdaptedSource.DefaultTrimThreshold,
  override val maxFailures: Int = 0)
    extends AggregatesV2AdaptedSource {

  // Leaving the source version undefined selects the most recent available version.
  override val sourceVersionOpt: None.type = None
}
|
||||
|
||||
/**
 * Reads the most recent available version of the aggregates v2 output
 * on or before the specified beforeDate.
 * Params are documented in the parent trait.
 */
case class AggregatesV2MostRecentFeatureSourceBeforeDate(
  override val rootPath: String,
  override val storeName: String,
  override val aggregates: Set[TypedAggregateGroup[_]],
  override val trimThreshold: Double = AggregatesV2AdaptedSource.DefaultTrimThreshold,
  beforeDate: RichDate,
  override val maxFailures: Int = 0)
    extends AggregatesV2AdaptedSource {

  // Upper bound for the version lookup: one millisecond past beforeDate.
  override val sourceVersionOpt: Some[Long] = Some(beforeDate.timestamp + 1L)

  // Pick the newest available version that does not exceed the bound above,
  // rather than requiring a version at exactly that timestamp.
  override val enableMostRecentBeforeSourceVersion: Boolean = true
}
|
|
@ -0,0 +1,71 @@
|
|||
scala_library(
|
||||
sources = ["*.scala"],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"3rdparty/jvm/com/twitter/algebird:core",
|
||||
"3rdparty/jvm/com/twitter/algebird:util",
|
||||
"3rdparty/jvm/com/twitter/bijection:core",
|
||||
"3rdparty/jvm/com/twitter/bijection:json",
|
||||
"3rdparty/jvm/com/twitter/bijection:netty",
|
||||
"3rdparty/jvm/com/twitter/bijection:scrooge",
|
||||
"3rdparty/jvm/com/twitter/bijection:thrift",
|
||||
"3rdparty/jvm/com/twitter/bijection:util",
|
||||
"3rdparty/jvm/com/twitter/storehaus:algebra",
|
||||
"3rdparty/jvm/com/twitter/storehaus:core",
|
||||
"3rdparty/src/jvm/com/twitter/scalding:commons",
|
||||
"3rdparty/src/jvm/com/twitter/scalding:core",
|
||||
"3rdparty/src/jvm/com/twitter/scalding:date",
|
||||
"3rdparty/src/jvm/com/twitter/summingbird:batch",
|
||||
"3rdparty/src/jvm/com/twitter/summingbird:core",
|
||||
"src/java/com/twitter/ml/api:api-base",
|
||||
"src/java/com/twitter/ml/api/constant",
|
||||
"src/scala/com/twitter/ml/api:api-base",
|
||||
"src/scala/com/twitter/ml/api/util",
|
||||
"src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits",
|
||||
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
|
||||
"src/thrift/com/twitter/ml/api:data-java",
|
||||
"src/thrift/com/twitter/ml/api:interpretable-model-java",
|
||||
"src/thrift/com/twitter/summingbird",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
"timelines/data_processing/ml_util/aggregation_framework/metrics",
|
||||
"util/util-core:scala",
|
||||
],
|
||||
)
|
||||
|
||||
scala_library(
|
||||
name = "for-timelines",
|
||||
sources = [
|
||||
"CombineCountsPolicy.scala",
|
||||
"SparseBinaryMergePolicy.scala",
|
||||
],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"3rdparty/jvm/com/twitter/algebird:core",
|
||||
"3rdparty/jvm/com/twitter/algebird:util",
|
||||
"3rdparty/jvm/com/twitter/bijection:core",
|
||||
"3rdparty/jvm/com/twitter/bijection:json",
|
||||
"3rdparty/jvm/com/twitter/bijection:netty",
|
||||
"3rdparty/jvm/com/twitter/bijection:scrooge",
|
||||
"3rdparty/jvm/com/twitter/bijection:thrift",
|
||||
"3rdparty/jvm/com/twitter/bijection:util",
|
||||
"3rdparty/jvm/com/twitter/storehaus:algebra",
|
||||
"3rdparty/jvm/com/twitter/storehaus:core",
|
||||
"3rdparty/src/jvm/com/twitter/scalding:commons",
|
||||
"3rdparty/src/jvm/com/twitter/scalding:core",
|
||||
"3rdparty/src/jvm/com/twitter/scalding:date",
|
||||
"3rdparty/src/jvm/com/twitter/summingbird:batch",
|
||||
"3rdparty/src/jvm/com/twitter/summingbird:core",
|
||||
"src/java/com/twitter/ml/api:api-base",
|
||||
"src/java/com/twitter/ml/api/constant",
|
||||
"src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits",
|
||||
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
|
||||
"src/thrift/com/twitter/ml/api:data-java",
|
||||
"src/thrift/com/twitter/ml/api:interpretable-model-java",
|
||||
"src/thrift/com/twitter/summingbird",
|
||||
"timelines/data_processing/ml_util/aggregation_framework:common_types",
|
||||
"timelines/data_processing/ml_util/aggregation_framework/metrics",
|
||||
"util/util-core:scala",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,223 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting
|
||||
import com.twitter.ml.api.util.SRichDataRecord
|
||||
import com.twitter.ml.api.FeatureContext
|
||||
import com.twitter.ml.api._
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.TypedCountMetric
|
||||
import java.lang.{Double => JDouble}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * The set of continuous features derived from a single count feature.
 *
 * @param sum     feature holding the sum of the count values
 * @param nonzero feature holding the number of nonzero count values
 * @param mean    feature holding the mean of the count values
 * @param topK    features holding the largest count values, one feature per rank
 */
case class CombinedFeatures(
  sum: Feature[JDouble],
  nonzero: Feature[JDouble],
  mean: Feature[JDouble],
  topK: Seq[Feature[JDouble]])
|
||||
|
||||
/**
 * Shared logic for deriving summary features (sum, nonzero count, mean and the top-k
 * values) from the values that one count feature takes across several aggregate records.
 */
trait CombineCountsBase {
  // Name suffixes of the derived features.
  val SparseSum = "sparse_sum"
  val SparseNonzero = "sparse_nonzero"
  val SparseMean = "sparse_mean"
  val SparseTop = "sparse_top"

  /** Number of top values for which a derived feature is produced. */
  def topK: Int

  /** When defined, only this many of the largest values are used for all derived features. */
  def hardLimit: Option[Int]

  /** Count features for which derived features are precomputed. */
  def precomputedCountFeatures: Seq[Feature[_]]

  /** For every precomputed count feature, the derived features that are written for it. */
  lazy val precomputedFeaturesMap: Map[Feature[_], CombinedFeatures] =
    precomputedCountFeatures.map { countFeature =>
      val personalDataTypes =
        AggregationMetricCommon.derivePersonalDataTypes(Some(countFeature))
      val baseName = countFeature.getDenseFeatureName

      // Derived features are named "<count feature name>.<suffix>".
      def derived(suffix: String): Feature[JDouble] =
        new Feature.Continuous(baseName + "." + suffix, personalDataTypes)

      val sumFeature = derived(SparseSum)
      val nonzeroFeature = derived(SparseNonzero)
      val meanFeature = derived(SparseMean)
      // Ranks are 1-based: sparse_top1, sparse_top2, ...
      val topKFeatures = (1 to topK).map(rank => derived(SparseTop + rank))

      (countFeature, CombinedFeatures(sumFeature, nonzeroFeature, meanFeature, topKFeatures))
    }.toMap

  /** All derived features across all precomputed count features. */
  lazy val outputFeaturesPostMerge: Set[Feature[JDouble]] =
    precomputedFeaturesMap.values.flatMap { combined: CombinedFeatures =>
      combined.sum +: combined.nonzero +: combined.mean +: combined.topK
    }.toSet

  private case class ComputedStats(sum: Double, nonzero: Double, mean: Double)

  // sum: total of all values; nonzero: number of values > 0.0;
  // mean: sum divided by `nonzero`, or 0.0 when no value is > 0.0.
  private def preComputeStats(featureValues: Seq[Double]): ComputedStats = {
    val total = featureValues.foldLeft(0.0)(_ + _)
    val positiveCount = featureValues.count(_ > 0.0).toDouble
    val mean = if (positiveCount > 0.0) total / positiveCount else 0.0
    ComputedStats(total, positiveCount, mean)
  }

  // Sorts in descending order.
  private def computeSortedFeatureValues(featureValues: List[Double]): List[Double] =
    featureValues.sortBy(-_)

  // k is 1-based; ranks outside the sequence yield 0.0.
  private def extractKth(sortedFeatureValues: Seq[Double], k: Int): Double = {
    val index = k - 1
    if (index >= 0 && index < sortedFeatureValues.length) sortedFeatureValues(index)
    else 0.0
  }

  private def setContinuousFeatureIfNonZero(
    record: SRichDataRecord,
    feature: Feature[JDouble],
    value: Double
  ): Unit = {
    val isZero = value == 0.0
    if (!isZero) {
      record.setFeatureValue(feature, value)
    }
  }

  /**
   * Writes the derived features into `richRecord` for every feature in `features`
   * that has an entry in `featureValuesMap`; features without an entry are skipped.
   */
  def hydrateCountFeatures(
    richRecord: SRichDataRecord,
    features: Seq[Feature[_]],
    featureValuesMap: Map[Feature[_], List[Double]]
  ): Unit =
    features.foreach { feature =>
      featureValuesMap.get(feature).foreach { featureValues =>
        mergeRecordFromCountFeature(
          richInputRecord = richRecord,
          countFeature = feature,
          featureValues = featureValues
        )
      }
    }

  /**
   * Computes sum, nonzero count, mean and top-k of `featureValues` and writes every
   * nonzero result into `richInputRecord` under the derived features of `countFeature`.
   * Does nothing when `featureValues` is empty.
   */
  def mergeRecordFromCountFeature(
    richInputRecord: SRichDataRecord,
    countFeature: Feature[_],
    featureValues: List[Double]
  ): Unit = {
    // In the majority of calls to this method from timeline scorer the featureValues
    // list is empty. Bailing out early avoids sorting an empty list, allocating several
    // options, making multiple function calls and iterating over [1, topK].
    if (featureValues.nonEmpty) {
      val descending = computeSortedFeatureValues(featureValues)
      val sortedFeatureValues =
        hardLimit.fold(descending)(limit => descending.take(limit)).toIndexedSeq
      val stats = preComputeStats(sortedFeatureValues)
      val combinedFeatures = precomputedFeaturesMap(countFeature)

      setContinuousFeatureIfNonZero(richInputRecord, combinedFeatures.sum, stats.sum)
      setContinuousFeatureIfNonZero(richInputRecord, combinedFeatures.nonzero, stats.nonzero)
      setContinuousFeatureIfNonZero(richInputRecord, combinedFeatures.mean, stats.mean)

      for (rank <- 1 to topK) {
        setContinuousFeatureIfNonZero(
          richInputRecord,
          combinedFeatures.topK(rank - 1),
          extractKth(sortedFeatureValues, rank)
        )
      }
    }
  }
}
|
||||
|
||||
object CombineCountsPolicy {

  /**
   * Selects the count features of a feature context: the CONTINUOUS features whose
   * dense feature name ends with the operator name of the typed count metric.
   *
   * @param aggregateContext feature context to scan
   * @return the matching features, in the order the context returns them
   */
  def getCountFeatures(aggregateContext: FeatureContext): Seq[Feature[_]] = {
    // The suffix does not depend on the feature being tested, so build the metric once
    // instead of once per candidate feature inside the filter.
    val countOperatorName = TypedCountMetric[JDouble]().operatorName
    aggregateContext.getAllFeatures.asScala.toSeq
      .filter { feature =>
        feature.getFeatureType == FeatureType.CONTINUOUS &&
        feature.getDenseFeatureName.endsWith(countOperatorName)
      }
  }

  /**
   * Collects the value of `countFeature` from every record in which it is present.
   *
   * @param dataRecordsWithCounts records to read from
   * @param countFeature feature to read; its values are cast to Double
   * @return the values found, in record order; records where getFeatureValue
   *         returns null contribute nothing
   */
  @VisibleForTesting
  private[conversion] def getFeatureValues(
    dataRecordsWithCounts: List[DataRecord],
    countFeature: Feature[_]
  ): List[Double] =
    dataRecordsWithCounts.map(new SRichDataRecord(_)).flatMap { record =>
      Option(record.getFeatureValue(countFeature)).map(_.asInstanceOf[JDouble].toDouble)
    }
}
|
||||
|
||||
/**
 * A merge policy that works whenever all aggregate features are
 * counts (computed using CountMetric), and typically represent
 * either impressions or engagements. For each such input count
 * feature, the policy outputs the following (3+k) derived features
 * into the output data record:
 *
 * - Sum of the feature's value across all aggregate records
 * - Number of aggregate records in which the feature's value is greater than zero
 * - Mean of the feature's value, computed as the sum divided by the number of
 *   aggregate records in which the value is greater than zero
 * - topK values of the feature across all aggregate records
 *
 * @param topK topK values to compute
 * @param aggregateContextToPrecompute feature context from which the count features
 *                                     (and hence the derived features) are precomputed
 * @param hardLimit when set, records are sorted and only the top values will be used for
 *                  aggregation if the number of records is higher than this hard limit.
 */
case class CombineCountsPolicy(
  override val topK: Int,
  aggregateContextToPrecompute: FeatureContext,
  override val hardLimit: Option[Int] = None)
    extends SparseBinaryMergePolicy
    with CombineCountsBase {

  import CombineCountsPolicy._

  override val precomputedCountFeatures: Seq[Feature[_]] =
    getCountFeatures(aggregateContextToPrecompute)

  /**
   * Merges using the precomputed count features. The `aggregateContext` argument is not
   * consulted; it is assumed to equal aggregateContextToPrecompute.
   */
  override def mergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    aggregateContext: FeatureContext
  ): Unit =
    mergeRecordFromCountFeatures(mutableInputRecord, aggregateRecords, precomputedCountFeatures)

  /** Same as mergeRecord, for callers that have no feature context to pass. */
  def defaultMergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord]
  ): Unit =
    mergeRecordFromCountFeatures(mutableInputRecord, aggregateRecords, precomputedCountFeatures)

  /**
   * For each of `countFeatures`, gathers its values from `aggregateRecords` and writes
   * the derived features into `mutableInputRecord`.
   */
  def mergeRecordFromCountFeatures(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    countFeatures: Seq[Feature[_]]
  ): Unit = {
    // One wrapper around the input record is shared by all count features.
    val richInputRecord = new SRichDataRecord(mutableInputRecord)
    for (countFeature <- countFeatures) {
      val featureValues = getFeatureValues(aggregateRecords, countFeature)
      mergeRecordFromCountFeature(richInputRecord, countFeature, featureValues)
    }
  }

  override def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]] =
    outputFeaturesPostMerge.map(_.asInstanceOf[Feature[_]])
}
|
|
@ -0,0 +1,46 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
|
||||
|
||||
import com.twitter.bijection.Injection
|
||||
import com.twitter.ml.api._
|
||||
import com.twitter.ml.api.util.SRichDataRecord
|
||||
import com.twitter.scalding.TypedPipe
|
||||
|
||||
object DataSetPipeSketchJoin {
  val DefaultSketchNumReducers: Int = 500
  val dataRecordMerger: DataRecordMerger = new DataRecordMerger

  // Serializes the string join keys to bytes; picked up implicitly by `sketch` below.
  implicit val str2Byte: String => Array[Byte] =
    implicitly[Injection[String, Array[Byte]]].toFunction

  /**
   * Computes a left sketch join on a set of skewed keys.
   *
   * Every input record is kept. When a record with the same key exists in
   * joinFeaturesDataSet, it is merged into the input record in place.
   *
   * @param inputDataSet left side of the join; all of its records are emitted
   * @param skewedJoinKeys tuple of features whose values form the join key
   * @param joinFeaturesDataSet right side of the join, merged into matching input records
   * @param sketchNumReducers number of reducers passed to `sketch`
   * @return the joined records together with the merged feature context of both sides
   */
  def apply(
    inputDataSet: DataSetPipe,
    skewedJoinKeys: Product,
    joinFeaturesDataSet: DataSetPipe,
    sketchNumReducers: Int = DefaultSketchNumReducers
  ): DataSetPipe = {
    val joinKeyList = skewedJoinKeys.productIterator.toList.asInstanceOf[List[Feature[_]]]

    // The key is the string form of the list of key feature values.
    // The record wrapper is created once per record rather than once per key feature.
    def makeKey(record: DataRecord): String = {
      val richRecord = SRichDataRecord(record)
      joinKeyList.map(keyFeature => richRecord.getFeatureValue(keyFeature)).toString
    }

    def byKey(pipe: DataSetPipe): TypedPipe[(String, DataRecord)] =
      pipe.records.map(record => (makeKey(record), record))

    val joinedRecords = byKey(inputDataSet)
      .sketch(sketchNumReducers)
      .leftJoin(byKey(joinFeaturesDataSet))
      .values
      .map {
        case (inputRecord, joinFeaturesOpt) =>
          // Unmatched input records pass through unchanged.
          joinFeaturesOpt.foreach { joinRecord => dataRecordMerger.merge(inputRecord, joinRecord) }
          inputRecord
      }

    DataSetPipe(
      joinedRecords,
      FeatureContext.merge(inputDataSet.featureContext, joinFeaturesDataSet.featureContext)
    )
  }
}
|
|
@ -0,0 +1,26 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
|
||||
|
||||
import com.twitter.ml.api._
|
||||
import com.twitter.ml.api.FeatureContext
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * A really bad default merge policy: it merges only the first aggregate record
 * in the list (i.e. the aggregate features of the first sparse key value) into
 * the input record and ignores all the others.
 * Aggregate features are not renamed, for simplicity.
 * Avoid using this merge policy if at all possible.
 */
object PickFirstRecordPolicy extends SparseBinaryMergePolicy {
  val dataRecordMerger: DataRecordMerger = new DataRecordMerger

  /** Merges the head of aggregateRecords into mutableInputRecord; does nothing for an empty list. */
  override def mergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    aggregateContext: FeatureContext
  ): Unit =
    aggregateRecords match {
      case firstAggregateRecord :: _ =>
        dataRecordMerger.merge(mutableInputRecord, firstAggregateRecord)
      case Nil =>
        ()
    }

  /** Since nothing is renamed, every feature of the aggregate context may appear after the merge. */
  override def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]] = {
    val allAggregateFeatures = aggregateContext.getAllFeatures.asScala
    allAggregateFeatures.toSet
  }
}
|
|
@ -0,0 +1,226 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
|
||||
|
||||
import com.twitter.ml.api._
|
||||
import com.twitter.ml.api.FeatureContext
|
||||
import com.twitter.ml.api.util.SRichDataRecord
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon
|
||||
import java.lang.{Boolean => JBoolean}
|
||||
import java.lang.{Double => JDouble}
|
||||
|
||||
/**
 * Names the three features involved in one engagement-rate computation.
 *
 * @param engagementFeature aggregate feature whose value is passed to calculateCtr as engagements
 * @param impressionFeature aggregate feature whose value is passed to calculateCtr as impressions
 * @param outputFeature feature on the input record that receives the combined score
 */
case class CtrDescriptor(
|
||||
engagementFeature: Feature[JDouble],
|
||||
impressionFeature: Feature[JDouble],
|
||||
outputFeature: Feature[JDouble])
|
||||
|
||||
object PickTopCtrBuilderHelper {

  /**
   * Builds one [[CtrDescriptor]] per engagement aggregate feature found in the aggregate
   * groups whose prefix equals aggregatePrefix.
   *
   * The impression feature is an output feature of a descriptor whose query has neither a
   * feature nor a label. Engagement features are the output features of descriptors whose
   * query has no feature and whose label is one of engagementLabels. Each descriptor's output
   * feature is a continuous feature named `<engagement feature name>.<outputSuffix>`.
   *
   * @param aggregatePrefix prefix selecting the aggregate groups to inspect
   * @param engagementLabels labels whose aggregates count as engagements
   * @param aggregatesToCompute all configured aggregate groups
   * @param outputSuffix suffix appended to the engagement feature name to name the output
   * @return one descriptor per matching engagement feature (possibly empty)
   * @throws NoSuchElementException if no matching group provides an impression feature
   */
  def createCtrDescriptors(
    aggregatePrefix: String,
    engagementLabels: Set[Feature[JBoolean]],
    aggregatesToCompute: Set[TypedAggregateGroup[_]],
    outputSuffix: String
  ): Set[CtrDescriptor] = {
    val matchingGroups = aggregatesToCompute
      .filter(_.aggregatePrefix == aggregatePrefix)

    // Previously a bare `.head`: an unknown prefix or a group without an unconditional
    // aggregate failed with an unexplained "next on empty iterator". Same exception type,
    // but the message now names the offending prefix.
    val impressionFeature = matchingGroups
      .flatMap { group =>
        group.individualAggregateDescriptors
          .filter(_.query.feature == None)
          .filter(_.query.label == None)
          .flatMap(_.outputFeatures)
      }
      .headOption
      .getOrElse(
        throw new NoSuchElementException(
          s"No impression aggregate (no feature, no label) found for aggregatePrefix '$aggregatePrefix'"
        )
      )
      .asInstanceOf[Feature[JDouble]]

    val aggregateEngagementFeatures =
      matchingGroups
        .flatMap { group =>
          group.individualAggregateDescriptors
            .filter(_.query.feature == None)
            .filter { descriptor =>
              //TODO: we should remove the need to pass around engagementLabels and just use all the labels available.
              descriptor.query.label.exists(engagementLabels.contains(_))
            }
            .flatMap(_.outputFeatures)
        }
        .map(_.asInstanceOf[Feature[JDouble]])

    aggregateEngagementFeatures
      .map { aggregateEngagementFeature =>
        CtrDescriptor(
          engagementFeature = aggregateEngagementFeature,
          impressionFeature = impressionFeature,
          outputFeature = new Feature.Continuous(
            aggregateEngagementFeature.getDenseFeatureName + "." + outputSuffix,
            AggregationMetricCommon.derivePersonalDataTypes(
              Some(aggregateEngagementFeature),
              Some(impressionFeature)
            )
          )
        )
      }
  }
}
|
||||
|
||||
object PickTopCtrPolicy {

  /**
   * Creates a [[PickTopCtrPolicy]] whose descriptors are derived from the aggregate groups
   * carrying the given prefix.
   *
   * @param aggregatePrefix prefix selecting the aggregate groups to use
   * @param engagementLabels labels whose aggregates count as engagements
   * @param aggregatesToCompute all configured aggregate groups
   * @param smoothing smoothing term handed to the policy
   * @param outputSuffix suffix used to name the output features
   */
  def build(
    aggregatePrefix: String,
    engagementLabels: Set[Feature[JBoolean]],
    aggregatesToCompute: Set[TypedAggregateGroup[_]],
    smoothing: Double = 1.0,
    outputSuffix: String = "ratio"
  ): PickTopCtrPolicy =
    PickTopCtrPolicy(
      ctrDescriptors = PickTopCtrBuilderHelper.createCtrDescriptors(
        aggregatePrefix,
        engagementLabels,
        aggregatesToCompute,
        outputSuffix
      ),
      smoothing = smoothing
    )
}
|
||||
|
||||
object CombinedTopNCtrsByWilsonConfidenceIntervalPolicy {

  /**
   * Creates a [[CombinedTopNCtrsByWilsonConfidenceIntervalPolicy]] whose descriptors are
   * derived from the aggregate groups carrying the given prefix.
   *
   * @param aggregatePrefix prefix selecting the aggregate groups to use
   * @param engagementLabels labels whose aggregates count as engagements
   * @param aggregatesToCompute all configured aggregate groups
   * @param outputSuffix suffix used to name the output features
   * @param z z-score handed to the policy
   * @param topN number of top rates the policy combines
   */
  def build(
    aggregatePrefix: String,
    engagementLabels: Set[Feature[JBoolean]],
    aggregatesToCompute: Set[TypedAggregateGroup[_]],
    outputSuffix: String = "ratioWithWCI",
    z: Double = 1.96,
    topN: Int = 1
  ): CombinedTopNCtrsByWilsonConfidenceIntervalPolicy =
    CombinedTopNCtrsByWilsonConfidenceIntervalPolicy(
      ctrDescriptors = PickTopCtrBuilderHelper.createCtrDescriptors(
        aggregatePrefix,
        engagementLabels,
        aggregatesToCompute,
        outputSuffix
      ),
      z = z,
      topN = topN
    )
}
|
||||
|
||||
/**
 * A merge policy that scores the aggregate records by engagement rate (derived
 * from two specified features, representing engagements and impressions) and
 * writes the resulting score to the specified outputFeature. Only that score is
 * added to the input record; the aggregate features themselves are not copied.
 *
 * This is an abstract class. We can make variants of this policy by overriding
 * the calculateCtr and combineTopNCtrsToSingleScore methods.
 */
abstract class PickTopCtrPolicyBase(ctrDescriptors: Set[CtrDescriptor])
    extends SparseBinaryMergePolicy {

  /** Reads a continuous feature from an already-wrapped record; a missing value reads as 0.0. */
  private def getContinuousFeature(
    richAggregateRecord: SRichDataRecord,
    feature: Feature[JDouble]
  ): Double =
    Option(richAggregateRecord.getFeatureValue(feature))
      .map(_.asInstanceOf[JDouble].toDouble)
      .getOrElse(0.0)

  /**
   * For every provided descriptor, compute the corresponding CTR feature
   * and only hydrate this result to the provided input record.
   *
   * @param mutableInputRecord record that receives the output features (mutated in place)
   * @param aggregateRecords aggregate records to compute the rates from
   * @param aggregateContext not used by this policy
   */
  override def mergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    aggregateContext: FeatureContext
  ): Unit = {
    // Wrap every record once per merge instead of once per feature read/write.
    val richInputRecord = SRichDataRecord(mutableInputRecord)
    val richAggregateRecords = aggregateRecords.map(record => SRichDataRecord(record))

    ctrDescriptors
      .foreach {
        case CtrDescriptor(engagementFeature, impressionFeature, outputFeature) =>
          val sortedCtrs =
            richAggregateRecords
              .map { richAggregateRecord =>
                val impressions = getContinuousFeature(richAggregateRecord, impressionFeature)
                val engagements = getContinuousFeature(richAggregateRecord, engagementFeature)
                calculateCtr(impressions, engagements)
              }
              // Descending order: the highest rate comes first.
              .sortBy { ctr => -ctr }
          // Nothing is written when the subclass yields no score (e.g. no aggregate records).
          combineTopNCtrsToSingleScore(sortedCtrs)
            .foreach { score =>
              richInputRecord.setFeatureValue(outputFeature, score)
            }
      }
  }

  /** Rate for one aggregate record, given its impression and engagement counts. */
  protected def calculateCtr(impressions: Double, engagements: Double): Double

  /** Reduces the rates (sorted in descending order) to the single score to output, if any. */
  protected def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double]

  /** Only the descriptors' output features are added by this policy. */
  override def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]] =
    ctrDescriptors
      .map(_.outputFeature)
      .toSet
}
|
||||
|
||||
case class PickTopCtrPolicy(ctrDescriptors: Set[CtrDescriptor], smoothing: Double = 1.0)
    extends PickTopCtrPolicyBase(ctrDescriptors) {
  // A strictly positive smoothing term keeps the denominator below away from zero
  // for non-negative impression counts.
  require(smoothing > 0.0)

  /** Smoothed engagement rate: engagements divided by (smoothing + impressions). */
  override def calculateCtr(impressions: Double, engagements: Double): Double = {
    val smoothedImpressions = smoothing + impressions
    engagements / smoothedImpressions
  }

  /** Keeps only the highest rate (the first of the descending list), if there is one. */
  override def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double] =
    sortedCtrs.headOption
}
|
||||
|
||||
case class CombinedTopNCtrsByWilsonConfidenceIntervalPolicy(
  ctrDescriptors: Set[CtrDescriptor],
  z: Double = 1.96,
  topN: Int = 1)
    extends PickTopCtrPolicyBase(ctrDescriptors) {

  // Terms of the Wilson formula that depend only on z, computed once.
  private val zSquared = z * z
  private val zSquaredDiv2 = zSquared / 2.0
  private val zSquaredDiv4 = zSquared / 4.0

  /**
   * calculates the lower bound of wilson score interval. which roughly says "the actual engagement
   * rate is at least this value" with confidence designated by the z-score:
   * https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval
   *
   * @param rawImpressions impression count as read from the aggregate record
   * @param engagements engagement count as read from the aggregate record
   * @return the lower bound, or 0.0 when there are no impressions
   */
  override def calculateCtr(rawImpressions: Double, engagements: Double): Double = {
    // just in case engagements happens to be more than impressions...
    val impressions = Math.max(rawImpressions, engagements)

    if (impressions > 0.0) {
      val p = engagements / impressions
      (p
        + zSquaredDiv2 / impressions
        - z * Math.sqrt(
          (p * (1.0 - p) + zSquaredDiv4 / impressions) / impressions)) / (1.0 + zSquared / impressions)

    } else 0.0
  }

  /**
   * takes the topN engagement rates, and returns the joint probability as {1.0 - Π(1.0 - p)}
   *
   * e.g. let's say you have 0.6 chance of clicking on a tweet shared by the user A.
   * you also have 0.3 chance of clicking on a tweet shared by the user B.
   * seeing a tweet shared by both A and B will not lead to 0.9 chance of you clicking on it.
   * but you could say that you have 0.4*0.7 chance of NOT clicking on that tweet.
   *
   * The product is accumulated in log space. log1p/expm1 are used instead of
   * log(1 - p) and 1 - exp(x) because engagement rates are often tiny, and the
   * naive forms round 1 - p to 1.0 and lose most significant digits of the result.
   *
   * @param sortedCtrs rates sorted in descending order
   * @return the combined score, or None when there are no rates at all
   */
  override def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double] =
    if (sortedCtrs.nonEmpty) {
      val logProbabilityOfNoEngagement = sortedCtrs
        .take(topN).map { p => Math.log1p(-p) }.sum
      // `0.0 - x` rather than `-x` so that an all-zero input still yields +0.0, not -0.0.
      Some(0.0 - Math.expm1(logProbabilityOfNoEngagement))
    } else None

}
|
|
@ -0,0 +1,199 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
|
||||
|
||||
import com.twitter.ml.api._
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.ml.api.util.SRichDataRecord
|
||||
import com.twitter.scalding.typed.TypedPipe
|
||||
import com.twitter.scalding.typed.UnsortedGrouped
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
|
||||
import java.util.{Set => JSet}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object SparseBinaryAggregateJoin {
  import TypedAggregateGroup._

  /**
   * Builds the join key of a record: the string form of the list of its key values.
   * For a sparse binary key the value is read from the feature returned by
   * `sparseFeature(sparseKey)`; any other key is read directly.
   *
   * @param record data record to read the key values from
   * @param joinKeyList list of join key features (some can be sparse and some non-sparse)
   * @return the string key to use for joining
   */
  def makeKey(record: DataRecord, joinKeyList: List[Feature[_]]): String = {
    // One wrapper per record instead of one per key feature.
    val richRecord = SRichDataRecord(record)
    joinKeyList.map {
      case sparseKey: Feature.SparseBinary =>
        richRecord.getFeatureValue(sparseFeature(sparseKey))
      case nonSparseKey: Feature[_] =>
        richRecord.getFeatureValue(nonSparseKey)
    }.toString
  }

  /**
   * @param record Data record to get all possible sparse aggregate keys from
   * @param joinKeyList List of join key features (some can be sparse and some non-sparse)
   * @return A list of string keys to use for joining
   */
  def makeKeyPermutations(record: DataRecord, joinKeyList: List[Feature[_]]): List[String] = {
    val richRecord = SRichDataRecord(record)
    // (feature id, set of values) for every key that is present in the record.
    val allIdValues = joinKeyList.flatMap {
      case sparseKey: Feature.SparseBinary => {
        val id = sparseKey.getDenseFeatureId
        val valuesOpt = Option(richRecord.getFeatureValue(sparseKey))
          .map(_.asInstanceOf[JSet[String]].asScala.toSet)
        valuesOpt.map { (id, _) }
      }
      case nonSparseKey: Feature[_] => {
        val id = nonSparseKey.getDenseFeatureId
        Option(richRecord.getFeatureValue(nonSparseKey)).map { value =>
          (id, Set(value.toString))
        }
      }
    }
    sparseBinaryPermutations(allIdValues).toList.map { idValues =>
      // Keys absent from a permutation contribute an empty string.
      joinKeyList.map { key => idValues.getOrElse(key.getDenseFeatureId, "") }.toString
    }
  }

  /** Keys every aggregate record by its join key. */
  private[this] def mkKeyIndexedAggregates(
    joinFeaturesDataSet: DataSetPipe,
    joinKeyList: List[Feature[_]]
  ): TypedPipe[(String, DataRecord)] =
    joinFeaturesDataSet.records
      .map { record => (makeKey(record, joinKeyList), record) }

  /** Emits every input record once per key permutation it can join on. */
  private[this] def mkKeyIndexedInput(
    inputDataSet: DataSetPipe,
    joinKeyList: List[Feature[_]]
  ): TypedPipe[(String, DataRecord)] =
    inputDataSet.records
      .flatMap { record =>
        makeKeyPermutations(record, joinKeyList).map { key => (key, record) }
      }

  /** Like mkKeyIndexedInput, but carries only the record's unique id instead of the record. */
  private[this] def mkKeyIndexedInputWithUniqueId(
    inputDataSet: DataSetPipe,
    joinKeyList: List[Feature[_]],
    uniqueIdFeatureList: List[Feature[_]]
  ): TypedPipe[(String, String)] =
    inputDataSet.records
      .flatMap { record =>
        // The unique id does not depend on the permutation: build it once per record.
        val uniqueId = makeKey(record, uniqueIdFeatureList)
        makeKeyPermutations(record, joinKeyList).map { key => (key, uniqueId) }
      }

  /** Joins on the key, then collects all matching aggregate records per input record. */
  private[this] def mkRecordIndexedAggregates(
    keyIndexedInput: TypedPipe[(String, DataRecord)],
    keyIndexedAggregates: TypedPipe[(String, DataRecord)]
  ): UnsortedGrouped[DataRecord, List[DataRecord]] =
    keyIndexedInput
      .join(keyIndexedAggregates)
      .map { case (_, (inputRecord, aggregateRecord)) => (inputRecord, aggregateRecord) }
      .group
      .toList

  /** Joins on the key, then collects all matching aggregate records per input unique id. */
  private[this] def mkRecordIndexedAggregatesWithUniqueId(
    keyIndexedInput: TypedPipe[(String, String)],
    keyIndexedAggregates: TypedPipe[(String, DataRecord)]
  ): UnsortedGrouped[String, List[DataRecord]] =
    keyIndexedInput
      .join(keyIndexedAggregates)
      .map { case (_, (inputId, aggregateRecord)) => (inputId, aggregateRecord) }
      .group
      .toList

  /**
   * Left-joins the input records with their collected aggregate records (keyed by the
   * input record itself) and applies the merge policy to every input record that has any.
   *
   * @param inputDataSet input records; every one of them is emitted
   * @param joinFeaturesDataSet aggregate data set; only its feature context is used here
   * @param recordIndexedAggregates aggregate records grouped by input record
   * @param mergePolicy policy that folds the aggregate records into the input record
   */
  def mkJoinedDataSet(
    inputDataSet: DataSetPipe,
    joinFeaturesDataSet: DataSetPipe,
    recordIndexedAggregates: UnsortedGrouped[DataRecord, List[DataRecord]],
    mergePolicy: SparseBinaryMergePolicy
  ): TypedPipe[DataRecord] =
    inputDataSet.records
      .map(record => (record, ()))
      .leftJoin(recordIndexedAggregates)
      .map {
        case (inputRecord, (_, aggregateRecordsOpt)) =>
          // The merge mutates inputRecord in place; unmatched records pass through as-is.
          aggregateRecordsOpt.foreach { aggregateRecords =>
            mergePolicy.mergeRecord(
              inputRecord,
              aggregateRecords,
              joinFeaturesDataSet.featureContext
            )
          }
          inputRecord
      }

  /**
   * Same as mkJoinedDataSet, but the aggregate records are grouped by the input record's
   * unique id (built from uniqueIdFeatureList) rather than by the record itself.
   *
   * @param inputDataSet input records; every one of them is emitted
   * @param joinFeaturesDataSet aggregate data set; only its feature context is used here
   * @param recordIndexedAggregates aggregate records grouped by input unique id
   * @param mergePolicy policy that folds the aggregate records into the input record
   * @param uniqueIdFeatureList features whose values form the unique id of an input record
   */
  def mkJoinedDataSetWithUniqueId(
    inputDataSet: DataSetPipe,
    joinFeaturesDataSet: DataSetPipe,
    recordIndexedAggregates: UnsortedGrouped[String, List[DataRecord]],
    mergePolicy: SparseBinaryMergePolicy,
    uniqueIdFeatureList: List[Feature[_]]
  ): TypedPipe[DataRecord] =
    inputDataSet.records
      .map(record => (makeKey(record, uniqueIdFeatureList), record))
      .leftJoin(recordIndexedAggregates)
      .map {
        case (_, (inputRecord, aggregateRecordsOpt)) =>
          aggregateRecordsOpt.foreach { aggregateRecords =>
            mergePolicy.mergeRecord(
              inputRecord,
              aggregateRecords,
              joinFeaturesDataSet.featureContext
            )
          }
          inputRecord
      }

  /**
   * Joins aggregate features onto the input data set.
   *
   * When none of the join keys is sparse binary, this is a plain `joinWithSmaller`.
   * Otherwise every input record is joined with the aggregate records of all its key
   * permutations and the merge policy condenses them into the input record.
   *
   * If uniqueIdFeatures is non-empty and the join keys include a sparse binary
   * key, the join will use this set of keys as a unique id to reduce
   * memory consumption. You should need this option only for
   * memory-intensive joins to avoid OOM errors.
   *
   * @param inputDataSet data set to add the aggregate features to
   * @param joinKeys tuple of join key features
   * @param joinFeaturesDataSet aggregate data set to join in
   * @param mergePolicy policy used to combine multiple aggregate records per input record
   * @param uniqueIdFeaturesOpt optional tuple of features uniquely identifying an input record
   */
  def apply(
    inputDataSet: DataSetPipe,
    joinKeys: Product,
    joinFeaturesDataSet: DataSetPipe,
    mergePolicy: SparseBinaryMergePolicy = PickFirstRecordPolicy,
    uniqueIdFeaturesOpt: Option[Product] = None
  ): DataSetPipe = {
    val joinKeyList = joinKeys.productIterator.toList.asInstanceOf[List[Feature[_]]]
    val containsSparseBinaryKey =
      joinKeyList.exists(_.getFeatureType() == FeatureType.SPARSE_BINARY)

    if (containsSparseBinaryKey) {
      val uniqueIdFeatureList = uniqueIdFeaturesOpt
        .map(uniqueIdFeatures =>
          uniqueIdFeatures.productIterator.toList.asInstanceOf[List[Feature[_]]])
        .getOrElse(List.empty[Feature[_]])
      val keyIndexedAggregates = mkKeyIndexedAggregates(joinFeaturesDataSet, joinKeyList)

      val joinedDataSet = if (uniqueIdFeatureList.isEmpty) {
        val keyIndexedInput = mkKeyIndexedInput(inputDataSet, joinKeyList)
        val recordIndexedAggregates =
          mkRecordIndexedAggregates(keyIndexedInput, keyIndexedAggregates)
        mkJoinedDataSet(inputDataSet, joinFeaturesDataSet, recordIndexedAggregates, mergePolicy)
      } else {
        val keyIndexedInput =
          mkKeyIndexedInputWithUniqueId(inputDataSet, joinKeyList, uniqueIdFeatureList)
        val recordIndexedAggregates =
          mkRecordIndexedAggregatesWithUniqueId(keyIndexedInput, keyIndexedAggregates)
        mkJoinedDataSetWithUniqueId(
          inputDataSet,
          joinFeaturesDataSet,
          recordIndexedAggregates,
          mergePolicy,
          uniqueIdFeatureList
        )
      }

      DataSetPipe(
        joinedDataSet,
        mergePolicy.mergeContext(
          inputDataSet.featureContext,
          joinFeaturesDataSet.featureContext
        )
      )
    } else {
      inputDataSet.joinWithSmaller(joinKeys, joinFeaturesDataSet) { _.pass }
    }
  }
}
|
|
@ -0,0 +1,81 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
|
||||
|
||||
import com.twitter.ml.api._
|
||||
import com.twitter.ml.api.FeatureContext
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
|
||||
* When using the aggregates framework to group by sparse binary keys,
|
||||
* we generate different aggregate feature values for each possible
|
||||
* value of the sparse key. Hence, when joining back the aggregate
|
||||
* features with a training data set, each individual training record
|
||||
* has multiple aggregate features to choose from, for each value taken
|
||||
* by the sparse key(s) in the training record. The merge policy trait
|
||||
* below specifies how to condense/combine this variable number of
|
||||
* aggregate features into a constant number of features for training.
|
||||
* Some simple policies might be: pick the first feature set (randomly),
|
||||
* pick the top sorted by some attribute, or take some average.
|
||||
*
|
||||
* Example: suppose we group by (ADVERTISER_ID, INTEREST_ID) where INTEREST_ID
|
||||
* is the sparse key, and compute a "CTR" aggregate feature for each such
|
||||
* pair measuring the click through rate on ads with (ADVERTISER_ID, INTEREST_ID).
|
||||
* Say we have the following aggregate records:
|
||||
*
|
||||
* (ADVERTISER_ID = 1, INTEREST_ID = 1, CTR = 5%)
|
||||
* (ADVERTISER_ID = 1, INTEREST_ID = 2, CTR = 15%)
|
||||
* (ADVERTISER_ID = 2, INTEREST_ID = 1, CTR = 1%)
|
||||
* (ADVERTISER_ID = 2, INTEREST_ID = 2, CTR = 10%)
|
||||
* ...
|
||||
* At training time, each training record has one value for ADVERTISER_ID, but it
|
||||
* has multiple values for INTEREST_ID e.g.
|
||||
*
|
||||
* (ADVERTISER_ID = 1, INTEREST_IDS = (1,2))
|
||||
*
|
||||
* There are multiple potential CTRs we can get when joining in the aggregate features:
|
||||
* in this case 2 values (5% and 15%) but in general it could be many depending on how
|
||||
* many interests the user has. When joining back the CTR features, the merge policy says how to
|
||||
* combine all these CTRs to engineer features.
|
||||
*
|
||||
* "Pick first" would say - pick some random CTR (whatever is first in the list, maybe 5%)
|
||||
* for training (probably not a good policy). "Sort by CTR" could be a policy
|
||||
* that just picks the top CTR and uses it as a feature (here 15%). Similarly, you could
|
||||
* imagine "Top K sorted by CTR" (use both 5 and 15%) or "Avg CTR" (10%) or other policies,
|
||||
* all of which are defined as objects/case classes that override this trait.
|
||||
*/
|
||||
trait SparseBinaryMergePolicy {

  /**
   * Folds the aggregate records into the input record, mutating it in place.
   *
   * @param mutableInputRecord Input record to add aggregates to
   * @param aggregateRecords Aggregate feature records
   * @param aggregateContext Context for aggregate records
   */
  def mergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    aggregateContext: FeatureContext
  ): Unit

  /**
   * @param aggregateContext Context for aggregate records
   * @return The features this policy contributes to a record after merging
   */
  def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]]

  /**
   * @param inputContext Context for input record
   * @param aggregateContext Context for aggregate records
   * @return Context for record returned by mergeRecord()
   */
  def mergeContext(
    inputContext: FeatureContext,
    aggregateContext: FeatureContext
  ): FeatureContext = {
    val inputFeatures: Set[Feature[_]] = inputContext.getAllFeatures.asScala.toSet
    val mergedFeatures = inputFeatures ++ aggregateFeaturesPostMerge(aggregateContext)
    new FeatureContext(mergedFeatures.toSeq.asJava)
  }

  /**
   * Output features of an aggregate group once this policy has been applied. Groups that
   * are not keyed by any sparse binary feature are unaffected by the policy, so their
   * output features are returned as they are.
   */
  def allOutputFeaturesPostMergePolicy[T](config: TypedAggregateGroup[T]): Set[Feature[_]] = {
    val hasSparseBinaryKey = config.keysToAggregate
      .exists(_.getFeatureType == FeatureType.SPARSE_BINARY)

    if (hasSparseBinaryKey)
      aggregateFeaturesPostMerge(new FeatureContext(config.allOutputFeatures.toSeq.asJava))
    else
      config.allOutputFeatures
  }
}
|
|
@ -0,0 +1,109 @@
|
|||
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
|
||||
|
||||
import com.twitter.bijection.Injection
|
||||
import com.twitter.ml.api._
|
||||
import com.twitter.ml.api.Feature
|
||||
import com.twitter.ml.api.util.SRichDataRecord
|
||||
import com.twitter.scalding.typed.TypedPipe
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup.sparseFeature
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * One aggregate data set to join in SparseBinaryMultipleAggregateJoin.
 *
 * @param aggregates aggregate records together with their feature context
 * @param sparseKey sparse binary feature whose values select the aggregate records to merge
 * @param mergePolicies policies applied, in order, to the selected aggregate records
 */
case class SparseJoinConfig(
|
||||
aggregates: DataSetPipe,
|
||||
sparseKey: Feature.SparseBinary,
|
||||
mergePolicies: SparseBinaryMergePolicy*)
|
||||
|
||||
object SparseBinaryMultipleAggregateJoin {
  // (common key value, ((sparse key feature, sparse key value), aggregate record))
  type CommonMap = (String, ((Feature.SparseBinary, String), DataRecord))

  /**
   * Joins several sparse-keyed aggregate data sets onto the source in a single join on a
   * common key, then applies each config's merge policies to the source records.
   *
   * All aggregate records sharing a common key value are collected into one map keyed by
   * (sparse key feature, sparse key value); if several records share such a key, only one
   * of them survives the `toMap`. For every source record with a match, and for every
   * join config, the aggregate records of the sparse key values present in the source
   * record are looked up and handed to the config's merge policies.
   *
   * @param source records to add the aggregate features to (mutated in place)
   * @param commonKey feature present in the source and in all aggregate data sets
   * @param joinConfigs aggregate data sets with their sparse key and merge policies
   * @param rightJoin if set, the join is expressed as aggregates.rightJoin(source);
   *                  ignored when isSketchJoin is set
   * @param isSketchJoin if set, a sketch join with numSketchJoinReducers reducers is used
   * @param numSketchJoinReducers reducer count passed to `sketch`; the default is 0, so
   *                              callers setting isSketchJoin presumably need to supply
   *                              a real value (TODO confirm)
   * @return the merged records and the feature context after applying all merge policies
   */
  def apply(
    source: DataSetPipe,
    commonKey: Feature[_],
    joinConfigs: Set[SparseJoinConfig],
    rightJoin: Boolean = false,
    isSketchJoin: Boolean = false,
    numSketchJoinReducers: Int = 0
  ): DataSetPipe = {
    val emptyPipe: TypedPipe[CommonMap] = TypedPipe.empty
    val aggregateMaps: Set[TypedPipe[CommonMap]] = joinConfigs.map { joinConfig =>
      joinConfig.aggregates.records.map { record =>
        val richRecord = SRichDataRecord(record)
        val sparseKeyValue =
          richRecord.getFeatureValue(sparseFeature(joinConfig.sparseKey)).toString
        val commonKeyValue = richRecord.getFeatureValue(commonKey).toString
        (commonKeyValue, ((joinConfig.sparseKey, sparseKeyValue), record))
      }
    }

    val commonKeyToAggregateMap = aggregateMaps
      .foldLeft(emptyPipe) { (union, next) => union ++ next }
      .group
      .toList
      .map {
        case (commonKeyValue, aggregateTuples) =>
          (commonKeyValue, aggregateTuples.toMap)
      }

    val commonKeyToRecordMap = source.records
      .map { record =>
        val commonKeyValue = SRichDataRecord(record).getFeatureValue(commonKey).toString
        (commonKeyValue, record)
      }

    // rightJoin is not supported by Sketched, so rightJoin will be ignored if isSketchJoin is set
    implicit val string2Byte: String => Array[Byte] =
      (value: String) => Injection[String, Array[Byte]](value)
    val intermediateRecords = if (isSketchJoin) {
      commonKeyToRecordMap.group
        .sketch(numSketchJoinReducers)
        .leftJoin(commonKeyToAggregateMap)
        .toTypedPipe
    } else if (rightJoin) {
      commonKeyToAggregateMap
        .rightJoin(commonKeyToRecordMap)
        .mapValues(_.swap)
        .toTypedPipe
    } else {
      commonKeyToRecordMap.leftJoin(commonKeyToAggregateMap).toTypedPipe
    }

    val joinedRecords = intermediateRecords
      .map {
        case (_, (inputRecord, aggregateTupleMapOpt)) =>
          aggregateTupleMapOpt.foreach { aggregateTupleMap =>
            joinConfigs.foreach { joinConfig =>
              // The record is re-wrapped for every config because earlier merge
              // policies may already have mutated it.
              val sparseKeyValues = Option(
                SRichDataRecord(inputRecord)
                  .getFeatureValue(joinConfig.sparseKey)
              ).map(_.asScala.toList)
                .getOrElse(List.empty[String])

              val aggregateRecords = sparseKeyValues.flatMap { sparseKeyValue =>
                aggregateTupleMap.get((joinConfig.sparseKey, sparseKeyValue))
              }

              joinConfig.mergePolicies.foreach { mergePolicy =>
                mergePolicy.mergeRecord(
                  inputRecord,
                  aggregateRecords,
                  joinConfig.aggregates.featureContext
                )
              }
            }
          }
          inputRecord
      }

    val joinedFeatureContext = joinConfigs
      .foldLeft(source.featureContext) {
        case (left, joinConfig) =>
          joinConfig.mergePolicies.foldLeft(left) {
            case (soFar, mergePolicy) =>
              mergePolicy.mergeContext(soFar, joinConfig.aggregates.featureContext)
          }
      }

    DataSetPipe(joinedRecords, joinedFeatureContext)
  }
}
|
|
@ -0,0 +1,5 @@
|
|||
aggregation.rst
|
||||
batch.rst
|
||||
index.rst
|
||||
real-time.rst
|
||||
troubleshooting.rst
|
|
@ -0,0 +1,167 @@
|
|||
.. _aggregation:
|
||||
|
||||
Core Concepts
|
||||
=============
|
||||
|
||||
This page provides an overview of the aggregation framework and goes through examples on how to define aggregate features. In general, we can think of an aggregate feature as a grouped set of records, on which we incrementally update the aggregate feature values, crossed by the provided features and conditional on the provided labels.
|
||||
|
||||
AggregateGroup
|
||||
--------------
|
||||
|
||||
An `AggregateGroup` defines a single unit of aggregate computation, similar to a SQL query. These are executed by the underlying jobs (internally, a `DataRecordAggregationMonoid <https://cgit.twitter.biz/source/tree/timelines/data_processing/ml_util/aggregation_framework/DataRecordAggregationMonoid.scala#n42>`_ is applied to `DataRecords` that contain the features to aggregate). Many of these groups can exist to define different types of aggregate features.
|
||||
|
||||
Let's start with the following examples of an `AggregateGroup` to discuss the meaning of each of its constructor arguments:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
val UserAggregateStore = "user_aggregates"
|
||||
val aggregatesToCompute: Set[TypedAggregateGroup[_]] = Set(
|
||||
AggregateGroup(
|
||||
inputSource = timelinesDailyRecapSource,
|
||||
aggregatePrefix = "user_aggregate_v2",
|
||||
preTransformOpt = Some(RemoveUserIdZero),
|
||||
keys = Set(USER_ID),
|
||||
features = Set(HAS_PHOTO),
|
||||
labels = Set(IS_FAVORITED),
|
||||
metrics = Set(CountMetric, SumMetric),
|
||||
halfLives = Set(50.days),
|
||||
outputStore = OfflineAggregateStore(
|
||||
name = UserAggregateStore,
|
||||
startDate = "2016-07-15 00:00",
|
||||
commonConfig = timelinesDailyAggregateSink,
|
||||
batchesToKeep = 5
|
||||
)
|
||||
)
|
||||
.flatMap(_.buildTypedAggregateGroups)
|
||||
)
|
||||
|
||||
This `AggregateGroup` computes the number of times each user has faved a tweet with a photo. The aggregate count is decayed with a 50 day halflife.
|
||||
|
||||
Naming and preprocessing
|
||||
------------------------
|
||||
|
||||
`UserAggregateStore` is a string val that acts as a scope of a "root path" to which this group of aggregate features will be written. The root path is provided separately by the implementing job.
|
||||
|
||||
`inputSource` defines the input source of `DataRecords` that we aggregate on. These records contain the relevant features required for aggregation.
|
||||
|
||||
`aggregatePrefix` tells the framework what prefix to use for the aggregate features it generates. A descriptive naming scheme with versioning makes it easier to maintain features as you add or remove them over the long-term.
|
||||
|
||||
`preTransforms` is a `Seq[com.twitter.ml.api.ITransform] <https://cgit.twitter.biz/source/tree/src/java/com/twitter/ml/api/ITransform.java>`_ that can be applied to the data records read from the input source before they are fed into the `AggregateGroup` to apply aggregation. These transforms are optional but can be useful for certain preprocessing operations for a group's raw input features.
|
||||
|
||||
.. admonition:: Examples
|
||||
|
||||
You can downsample input data records by providing `preTransforms`. In addition, you could join different input labels (e.g. "is_push_opened" and "is_push_favorited") and transform them into a combined label that is their union ("is_push_engaged") on which aggregate counts will be calculated.
|
||||
|
||||
|
||||
Keys
|
||||
----
|
||||
|
||||
`keys` is a crucial field in the config. It defines a `Set[com.twitter.ml.api.Feature]` which specifies a set of grouping keys to use for this `AggregateGroup`.
|
||||
|
||||
Keys can only be of 3 supported types currently: `DISCRETE`, `STRING` and `SPARSE_BINARY`. Using a discrete or a string/text feature as a key specifies the unit to group records by before applying counting/aggregation operators.
|
||||
|
||||
|
||||
.. admonition:: Examples
|
||||
|
||||
.. cssclass:: shortlist
|
||||
|
||||
#. If the key is `USER_ID`, this tells the framework to group all records by `USER_ID`, and then apply aggregations (sum/count/etc) within each user’s data to generate aggregate features for each user.
|
||||
|
||||
#. If the key is `(USER_ID, AUTHOR_ID)`, then the `AggregateGroup` will output features for each unique user-author pair in the input data.
|
||||
|
||||
#. Finally, using a sparse binary feature as key has special "flattening" or "flatMap" like semantics. For example, consider grouping by `(USER_ID, AUTHOR_INTEREST_IDS)` where `AUTHOR_INTEREST_IDS` is a sparse binary feature which represents a set of topic IDs the author may be tweeting about. This creates one record for each `(user_id, interest_id)` pair - so each record with multiple author interests is flattened before feeding it to the aggregation.
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
`features` specifies a `Set[com.twitter.ml.api.Feature]` to aggregate within each group (defined by the keys specified earlier).
|
||||
|
||||
We support 2 types of `features`: `BINARY` and `CONTINUOUS`.
|
||||
|
||||
The semantics of how the aggregation works is slightly different based on the type of “feature”, and based on the “metric” (or aggregation operation):
|
||||
|
||||
.. cssclass:: shortlist
|
||||
|
||||
#. Binary Feature, Count Metric: Suppose we have a binary feature `HAS_PHOTO` in this set, and are applying the “Count” metric (see below for more details on the metrics), with key `USER_ID`. The semantics is that this computes a feature which measures the count of records with `HAS_PHOTO` set to true for each user.
|
||||
|
||||
#. Binary Feature, Sum Metric - Does not apply. No feature will be computed.
|
||||
|
||||
#. Continuous Feature, Count Metric - The count metric treats all features as binary features ignoring their value. For example, suppose we have a continuous feature `NUM_CHARACTERS_IN_TWEET`, and key `USER_ID`. This measures the count of records that have this feature `NUM_CHARACTERS_IN_TWEET` present.
|
||||
|
||||
#. Continuous Feature, Sum Metric - In the above example, the feature measures the sum of (num_characters_in_tweet) over all of a user’s records. Dividing this sum feature by the count feature would give the average number of characters per tweet.
|
||||
|
||||
.. admonition:: Unsupported feature types
|
||||
|
||||
`DISCRETE` and `SPARSE` features are not supported by the Sum Metric, because there is no meaning in summing a discrete feature or a sparse feature. You can use them with the CountMetric, but they may not do what you would expect since they will be treated as binary features losing all the information within the feature. The best way to use these is as “keys” and not as “features”.
|
||||
|
||||
.. admonition:: Setting includeAnyFeature
|
||||
|
||||
If constructor argument `includeAnyFeature` is set, the framework will append a feature with scope `any_feature` to the set of all features you define. This additional feature simply measures the total count of records. So if you set your features to be equal to Set.empty, this will measure the count of records for a given `USER_ID`.
|
||||
|
||||
Labels
|
||||
------
|
||||
|
||||
`labels` specifies a set of `BINARY` features that you can cross with, prior to applying aggregations on the `features`. This essentially restricts the aggregate computation to a subset of the records within a particular key.
|
||||
|
||||
We typically use this to represent engagement labels in an ML model, in this case, `IS_FAVORITED`.
|
||||
|
||||
In this example, we are grouping by `USER_ID`, the feature is `HAS_PHOTO`, the label is `IS_FAVORITED`, and we are computing `CountMetric`. The system will output a feature for each user that represents the number of favorites on tweets having photos by this `userId`.
|
||||
|
||||
.. admonition:: Setting includeAnyLabel
|
||||
|
||||
If constructor argument `includeAnyLabel` is set (as it is by default), then similar to `any_feature`, the framework automatically appends a label of type `any_label` to the set of all labels you define, which represents not applying any filter or cross.
|
||||
|
||||
In this example, `any_label` and `any_feature` are set by default and the system would actually output 4 features for each `user_id`:
|
||||
|
||||
.. cssclass:: shortlist
|
||||
|
||||
#. The number of `IS_FAVORITED` (favorites) on tweet impressions having `HAS_PHOTO=true`
|
||||
|
||||
#. The number of `IS_FAVORITED` (favorites) on all tweet impressions (`any_feature` aggregate)
|
||||
|
||||
#. The number of tweet impressions having `HAS_PHOTO=true` (`any_label` aggregate)
|
||||
|
||||
#. The total number of tweet impressions for this user id (`any_feature.any_label` aggregate)
|
||||
|
||||
.. admonition:: Disabling includeAnyLabel
|
||||
|
||||
To disable this automatically generated feature you can use `includeAnyLabel = false` in your config. This will remove some useful features (particularly for counterfactual signal), but it can greatly save on space since it does not store every possible impressed set of keys in the output store. So use this if you are short on space, but not otherwise.
|
||||
|
||||
Metrics
|
||||
-------
|
||||
|
||||
`metrics` specifies the aggregate operators to apply. The most commonly used are `Count`, `Sum` and `SumSq`.
|
||||
|
||||
As mentioned before, `Count` can be applied to all types of features, but treats every feature as binary and ignores the value of the feature. `Sum` and `SumSq` can only be applied to Continuous features - they will ignore all other features you specify. By combining sum and sumsq and count, you can produce powerful “z-score” features or other distributional features using a post-transform.
|
||||
|
||||
It is also possible to add your own aggregate operators (e.g. `LastResetMetric <https://phabricator.twitter.biz/D228537>`_) to the framework with some additional work.
|
||||
|
||||
HalfLives
|
||||
---------
|
||||
|
||||
`halfLives` specifies how fast aggregate features should be decayed. It is important to note that the framework works on an incremental basis: in the batch implementation, the summingbird-scalding job takes in the most recently computed aggregate features, processed on data until day `N-1`, then reads new data records for day `N` and computes updated values of the aggregate features. Similarly, the decay of real-time aggregate features takes the actual time delta between the current time and the last time the aggregate feature value was updated.
|
||||
|
||||
The halflife `H` specifies how fast to decay old sums/counts to simulate a sliding window of counts. The implementation is such that it will take `H` amount of time to decay an aggregate feature to half its initial value. New observed values of sums/counts are added to the aggregate feature value.
|
||||
|
||||
.. admonition:: Batch and real-time
|
||||
|
||||
In the batch use case where aggregate features are recomputed on a daily basis, we typically take halflives on the order of weeks or longer (in Timelines, 50 days). In the real-time use case, shorter halflives are appropriate (hours) since they are updated as client engagements are received by the summingbird job.
|
||||
|
||||
|
||||
SQL Equivalent
|
||||
--------------
|
||||
Conceptually, you can also think of it as:
|
||||
|
||||
.. code-block:: sql
|
||||
|
||||
INSERT INTO <outputStore>.<aggregatePrefix>
|
||||
SELECT AGG(<features>) /* AGG is <metrics>, which is an exponentially decaying SUM or COUNT etc. based on the halfLives */
|
||||
FROM (
|
||||
SELECT preTransformOpt(*) FROM <inputSource>
|
||||
)
|
||||
WHERE <labels> = True
|
||||
GROUP BY <keys>
|
||||
|
||||
any_feature is AGG(*).
|
||||
|
||||
any_label removes the WHERE clause.
|
|
@ -0,0 +1,215 @@
|
|||
.. _batch:
|
||||
|
||||
Batch aggregate feature jobs
|
||||
============================
|
||||
|
||||
In the previous section, we went over the core concepts of the aggregation framework and discussed how you can set up your own `AggregateGroups` to compute aggregate features.
|
||||
|
||||
Given these groups, this section will discuss how you can set up offline batch jobs to produce the corresponding aggregate features, updated daily. To accomplish this, we need to set up a summingbird-scalding job that is pointed to the input data records containing features and labels to be aggregated.
|
||||
|
||||
Input Data
|
||||
----------
|
||||
|
||||
In order to generate aggregate features, the relevant input features need to be available offline as a daily scalding source in `DataRecord` format (typically `DailySuffixFeatureSource <https://cgit.twitter.biz/source/tree/src/scala/com/twitter/ml/api/FeatureSource.scala>`_; `HourlySuffixFeatureSource` may also work, but we have not tested it).
|
||||
|
||||
.. admonition:: Note
|
||||
|
||||
The input data source should contain the keys, features and labels you want to use in your `AggregateGroups`.
|
||||
|
||||
Aggregation Config
|
||||
------------------
|
||||
|
||||
Now that we have a daily data source with input features and labels, we need to set up the `AggregateGroup` config itself. This contains all aggregation groups that you would like to compute and we will go through the implementation step-by-step.
|
||||
|
||||
.. admonition:: Example: Timelines Quality config
|
||||
|
||||
`TimelinesAggregationConfig <https://cgit.twitter.biz/source/tree/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfig.scala>`_ imports the configured `AggregateGroups` from `TimelinesAggregationConfigDetails <https://cgit.twitter.biz/source/tree/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigDetails.scala>`_. The config is then referenced by the implementing summingbird-scalding job which we will set up below.
|
||||
|
||||
OfflineAggregateSource
|
||||
----------------------
|
||||
|
||||
Each `AggregateGroup` will need to define a (daily) source of input features. We use `OfflineAggregateSource` for this to tell the aggregation framework where the input data set is and the required timestamp feature that the framework uses to decay aggregate feature values:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
val timelinesDailyRecapSource = OfflineAggregateSource(
|
||||
name = "timelines_daily_recap",
|
||||
timestampFeature = TIMESTAMP,
|
||||
scaldingHdfsPath = Some("/user/timelines/processed/suggests/recap/data_records"),
|
||||
scaldingSuffixType = Some("daily"),
|
||||
withValidation = true
|
||||
)
|
||||
|
||||
.. admonition:: Note
|
||||
|
||||
.. cssclass:: shortlist
|
||||
|
||||
#. The name is not important as long as it is unique.
|
||||
|
||||
#. `timestampFeature` must be a discrete feature of type `com.twitter.ml.api.Feature[Long]` and represents the “time” of a given training record in milliseconds - for example, the time at which an engagement, push open event, or abuse event took place that you are trying to train on. If you do not already have such a feature in your daily training data, you need to add one.
|
||||
|
||||
#. `scaldingSuffixType` can be “hourly” or “daily” depending on the type of source (`HourlySuffixFeatureSource` vs `DailySuffixFeatureSource`).
|
||||
|
||||
#. Set `withValidation` to true to validate the presence of the _SUCCESS file. Context: https://jira.twitter.biz/browse/TQ-10618
|
||||
|
||||
Output HDFS store
|
||||
-----------------
|
||||
|
||||
The output HDFS store is where the computed aggregate features are stored. This store contains all computed aggregate feature values and is incrementally updated by the aggregates job every day.
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
val outputHdfsPath = "/user/timelines/processed/aggregates_v2"
|
||||
val timelinesOfflineAggregateSink = new OfflineStoreCommonConfig {
|
||||
override def apply(startDate: String) = new OfflineAggregateStoreCommonConfig(
|
||||
outputHdfsPathPrefix = outputHdfsPath,
|
||||
dummyAppId = "timelines_aggregates_v2_ro", // unused - can be arbitrary
|
||||
dummyDatasetPrefix = "timelines_aggregates_v2_ro", // unused - can be arbitrary
|
||||
startDate = startDate
|
||||
)
|
||||
}
|
||||
|
||||
Note: `dummyAppId` and `dummyDatasetPrefix` are unused so can be set to any arbitrary value. They should be removed on the framework side.
|
||||
|
||||
The `outputHdfsPathPrefix` is the only field that matters, and should be set to the HDFS path where you want to store the aggregate features. Make sure you have a lot of quota available at that path.
|
||||
|
||||
Setting Up Aggregates Job
|
||||
-------------------------
|
||||
|
||||
Once you have defined a config file with the aggregates you would like to compute, the next step is to create the aggregates scalding job using the config (`example <https://cgit.twitter.biz/source/tree/timelines/data_processing/ad_hoc/aggregate_interactions/v2/offline_aggregation/TimelinesAggregationScaldingJob.scala>`_). This is very concise and requires only a few lines of code:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
object TimelinesAggregationScaldingJob extends AggregatesV2ScaldingJob {
|
||||
override val aggregatesToCompute = TimelinesAggregationConfig.aggregatesToCompute
|
||||
}
|
||||
|
||||
Now that the scalding job is implemented with the aggregation config, we need to set up a capesos config similar to https://cgit.twitter.biz/source/tree/science/scalding/mesos/timelines/prod.yml:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
# Common configuration shared by all aggregates v2 jobs
|
||||
__aggregates_v2_common__: &__aggregates_v2_common__
|
||||
class: HadoopSummingbirdProducer
|
||||
bundle: offline_aggregation-deploy.tar.gz
|
||||
mainjar: offline_aggregation-deploy.jar
|
||||
pants_target: "bundle timelines/data_processing/ad_hoc/aggregate_interactions/v2/offline_aggregation:bin"
|
||||
cron_collision_policy: CANCEL_NEW
|
||||
use_libjar_wild_card: true
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
# Specific job computing user aggregates
|
||||
user_aggregates_v2:
|
||||
<<: *__aggregates_v2_common__
|
||||
cron_schedule: "25 * * * *"
|
||||
arguments: --batches 1 --output_stores user_aggregates --job_name timelines_user_aggregates_v2
|
||||
|
||||
.. admonition:: Important
|
||||
|
||||
Each AggregateGroup in your config should have its own associated offline job which specifies `output_stores` pointing to the output store name you defined in your config.
|
||||
|
||||
Running The Job
|
||||
---------------
|
||||
|
||||
When you run the batch job for the first time, you need to add a temporary entry to your capesos yml file that looks like this:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
user_aggregates_v2_initial_run:
|
||||
<<: *__aggregates_v2_common__
|
||||
cron_schedule: "25 * * * *"
|
||||
arguments: --batches 1 --start-time "2017-03-03 00:00:00" --output_stores user_aggregates --job_name timelines_user_aggregates_v2
|
||||
|
||||
.. admonition:: Start Time
|
||||
|
||||
The additional `--start-time` argument should match the `startDate` in your config for that AggregateGroup, but in the format `yyyy-mm-dd hh:mm:ss`.
|
||||
|
||||
To invoke the initial run via capesos, we would do the following (in Timelines case):
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
CAPESOSPY_ENV=prod capesospy-v2 update --build_locally --start_cron user_aggregates_v2_initial_run science/scalding/mesos/timelines/prod.yml
|
||||
|
||||
Once it is running smoothly, you can deschedule the initial run job and delete the temporary entry from your production yml config.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
aurora cron deschedule atla/timelines/prod/user_aggregates_v2_initial_run
|
||||
|
||||
Note: deschedule it preemptively to avoid repeatedly overwriting the same initial results
|
||||
|
||||
Then schedule the production job from jenkins using something like this:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
CAPESOSPY_ENV=prod capesospy-v2 update user_aggregates_v2 science/scalding/mesos/timelines/prod.yml
|
||||
|
||||
All future runs (2nd onwards) will use the permanent entry in the capesos yml config that does not have the `start-time` specified.
|
||||
|
||||
.. admonition:: Job name has to match
|
||||
|
||||
It's important that the production run should share the same `--job_name` with the initial_run so that eagleeye/statebird knows how to keep track of it correctly.
|
||||
|
||||
Output Aggregate Features
|
||||
-------------------------
|
||||
|
||||
This scalding job using the example config from the earlier section would output a VersionedKeyValSource to `/user/timelines/processed/aggregates_v2/user_aggregates` on HDFS.
|
||||
|
||||
Note that `/user/timelines/processed/aggregates_v2` is the explicitly defined root path while `user_aggregates` is the output directory of the example `AggregateGroup` defined earlier. The latter can be different for different `AggregateGroups` defined in your config.
|
||||
|
||||
|
||||
The VersionedKeyValSource is difficult to use directly in your jobs/offline trainings, but we provide an adapted source `AggregatesV2FeatureSource` that makes it easy to join and use in your jobs:
|
||||
|
||||
.. code-block:: scala
|
||||
|
||||
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion._
|
||||
|
||||
val pipe: DataSetPipe = AggregatesV2FeatureSource(
|
||||
rootPath = "/user/timelines/processed/aggregates_v2",
|
||||
storeName = "user_aggregates",
|
||||
aggregates = TimelinesAggregationConfig.aggregatesToCompute,
|
||||
trimThreshold = 0
|
||||
)(dateRange).read
|
||||
|
||||
Simply replace the `rootPath`, `storeName` and `aggregates` object with whatever you defined. The `trimThreshold` tells the framework to trim all features below a certain cutoff: 0 is a safe default to use to begin with.
|
||||
|
||||
.. admonition:: Usage
|
||||
|
||||
This can now be used like any other `DataSetPipe` in offline ML jobs. You can write out the features to a `DailySuffixFeatureSource`, you can join them with your data offline for trainings, or you can write them to a Manhattan store for serving online.
|
||||
|
||||
Aggregate Features Example
|
||||
--------------------------
|
||||
|
||||
Here is a sample of the aggregate features we just computed:
|
||||
|
||||
.. code-block:: text
|
||||
|
||||
user_aggregate_v2.pair.any_label.any_feature.50.days.count: 100.0
|
||||
user_aggregate_v2.pair.any_label.tweetsource.is_quote.50.days.count: 30.0
|
||||
user_aggregate_v2.pair.is_favorited.any_feature.50.days.count: 10.0
|
||||
user_aggregate_v2.pair.is_favorited.tweetsource.is_quote.50.days.count: 6.0
|
||||
meta.user_id: 123456789
|
||||
|
||||
Aggregate feature names match a `prefix.pair.label.feature.half_life.metric` schema and correspond to what was defined in the aggregation config for each of these fields.
|
||||
|
||||
.. admonition:: Example
|
||||
|
||||
In this example, the above features are capturing that userId 123456789L has:
|
||||
|
||||
..
|
||||
A 50-day decayed count of 100 training records with any label or feature (“tweet impressions”)
|
||||
|
||||
A 50-day decayed count of 30 records that are “quote tweets” (tweetsource.is_quote = true)
|
||||
|
||||
A 50-day decayed count of 10 records that are favorites on any type of tweet (is_favorited = true)
|
||||
|
||||
A 50-day decayed count of 6 records that are “favorites” on “quote tweets” (both of the above are true)
|
||||
|
||||
By combining the above, a model might infer that for this specific user, quote tweets comprise 30% of all impressions, have a favorite rate of 6/30 = 20%, compared to a favorite rate of 10/100 = 10% on the total population of tweets.
|
||||
|
||||
Therefore, being a quote tweet makes this specific user `123456789L` approximately twice as likely to favorite the tweet, which is useful for prediction and could lead the ML model to score and rank quote tweets higher in a personalized fashion for this user.
|
||||
|
||||
Tests for Feature Names
|
||||
--------------------------
|
||||
When you change or add an `AggregateGroup`, feature names might change. The Feature Store provides a testing mechanism to assert that the feature names change as you expect. See `tests for feature names <https://docbird.twitter.biz/ml_feature_store/catalog.html#tests-for-feature-names>`_.
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue