/**
 * Transforms BCE (behavior client events) UUA data records that carry only a continuous
 * dwell time into data records that also carry the corresponding binary label features.
 *
 * Per the original author's note, the incoming UUA data records have USER_ID,
 * SOURCE_TWEET_ID and TIMESTAMP, plus zero or one of TWEET_DETAIL_DWELL_TIME_MS,
 * PROFILE_DWELL_TIME_MS and FULLSCREEN_VIDEO_DWELL_TIME_MS. The dwell-time feature that is
 * present decides the engagement type; the binary labels are then written through the shared
 * label conversion (EngagementTypeConverter) rather than being re-implemented here.
 */
object BCELabelTransformFromUUADataRecord extends ITransform {

  /** Dwell-time feature -> engagement type that the presence of this feature stands for. */
  val dwellTimeFeatureToEngagementMap = Map(
    TimelinesSharedFeatures.TWEET_DETAIL_DWELL_TIME_MS -> EngagementType.TweetDetailDwell,
    TimelinesSharedFeatures.PROFILE_DWELL_TIME_MS -> EngagementType.ProfileDwell,
    TimelinesSharedFeatures.FULLSCREEN_VIDEO_DWELL_TIME_MS -> EngagementType.FullscreenVideoDwell
  )

  /**
   * Builds the engagement described by a single dwell-time feature of a record.
   *
   * @param rdr              record to read the dwell time and the timestamp from
   * @param dwellTimeFeature dwell-time feature to look for
   * @param engagementType   engagement type to emit when the feature is present
   * @return `Some(engagement)` weighted by the dwell time when `rdr` has `dwellTimeFeature`,
   *         `None` otherwise
   */
  def dwellFeatureToEngagement(
    rdr: RichDataRecord,
    dwellTimeFeature: Feature[JDouble],
    engagementType: EngagementType
  ): Option[Engagement] =
    if (!rdr.hasFeature(dwellTimeFeature)) {
      None
    } else {
      // NOTE(review): TIMESTAMP is read without a presence check; this relies on the input
      // contract described on the object. Confirm against the upstream source.
      Some(
        Engagement(
          engagementType = engagementType,
          timestampMs = rdr.getFeatureValue(SharedFeatures.TIMESTAMP),
          weight = Some(rdr.getFeatureValue(dwellTimeFeature))
        )
      )
    }

  /** Adds the tweet-detail, profile and fullscreen-video dwell engagement features to the context. */
  override def transformContext(featureContext: FeatureContext): FeatureContext = {
    val dwellEngagementFeatures =
      CombinedFeatures.TweetDetailDwellEngagements ++
        CombinedFeatures.ProfileDwellEngagements ++
        CombinedFeatures.FullscreenVideoDwellEngagements

    featureContext.addFeatures(dwellEngagementFeatures.toSeq: _*)
  }

  /**
   * Collects one engagement per dwell-time feature present on `record` and hands them to the
   * shared dwell-time label setter, which operates on the wrapped record.
   */
  override def transform(record: DataRecord): Unit = {
    val richRecord = new RichDataRecord(record)

    // Features that are absent from the record simply contribute nothing.
    val dwellEngagements = dwellTimeFeatureToEngagementMap.toList.flatMap {
      case (feature, engagementType) =>
        dwellFeatureToEngagement(richRecord, feature, engagementType)
    }

    // Re-use the BCE (behavior client events) label conversion in EngagementTypeConverter so
    // these labels stay aligned with BCE label generation for offline training data.
    EngagementLabelFeaturesDataRecordUtils.setDwellTimeFeatures(
      richRecord,
      Some(dwellEngagements),
      AdapterConsumer.Combined
    )
  }
}
scala_dependencies = [ + ":injections", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) + +create_datasets( + base_name = "twitter_wide_user_author_aggregates", + fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/twitter_wide_user_author_aggregates/1556323200000", + key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", + platform = "java8", + role = "timelines", + scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.TwitterWideUserAuthor", + segment_type = "snapshot", + tags = ["bazel-compatible"], + val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", + scala_dependencies = [ + ":injections", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) + +create_datasets( + base_name = "user_aggregates", + fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_aggregates/1556150400000", + key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", + platform = "java8", + role = "timelines", + scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.User", + segment_type = "snapshot", + tags = ["bazel-compatible"], + val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", + scala_dependencies = [ + ":injections", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) + +create_datasets( + base_name = "user_author_aggregates", + fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_author_aggregates/1556064000000", + key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", + platform = "java8", + role = "timelines", + scala_schema = 
"com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserAuthor", + segment_type = "snapshot", + tags = ["bazel-compatible"], + val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", + scala_dependencies = [ + ":injections", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) + +create_datasets( + base_name = "aggregates_canary", + fallback_path = "gs://user.timelines.dp.gcp.twttr.net//canaries/processed/aggregates_v2/user_aggregates/1622851200000", + key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", + platform = "java8", + role = "timelines", + scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.User", + segment_type = "snapshot", + tags = ["bazel-compatible"], + val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", + scala_dependencies = [ + ":injections", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) + +create_datasets( + base_name = "user_engager_aggregates", + fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_engager_aggregates/1556496000000", + key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", + platform = "java8", + role = "timelines", + scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserEngager", + segment_type = "snapshot", + tags = ["bazel-compatible"], + val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", + scala_dependencies = [ + ":injections", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) + +create_datasets( + base_name = "user_original_author_aggregates", + fallback_path = 
"viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_original_author_aggregates/1556496000000", + key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", + platform = "java8", + role = "timelines", + scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserOriginalAuthor", + segment_type = "snapshot", + tags = ["bazel-compatible"], + val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", + scala_dependencies = [ + ":injections", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) + +create_datasets( + base_name = "author_topic_aggregates", + fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/author_topic_aggregates/1589932800000", + key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", + platform = "java8", + role = "timelines", + scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.AuthorTopic", + segment_type = "snapshot", + tags = ["bazel-compatible"], + val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", + scala_dependencies = [ + ":injections", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) + +create_datasets( + base_name = "user_topic_aggregates", + fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_topic_aggregates/1590278400000", + key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", + platform = "java8", + role = "timelines", + scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserTopic", + segment_type = "snapshot", + tags = ["bazel-compatible"], + val_type = "(com.twitter.summingbird.batch.BatchID, 
com.twitter.ml.api.DataRecord)", + scala_dependencies = [ + ":injections", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) + +create_datasets( + base_name = "user_inferred_topic_aggregates", + fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_inferred_topic_aggregates/1599696000000", + key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", + platform = "java8", + role = "timelines", + scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserInferredTopic", + segment_type = "snapshot", + tags = ["bazel-compatible"], + val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", + scala_dependencies = [ + ":injections", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) + +create_datasets( + base_name = "user_mention_aggregates", + fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_mention_aggregates/1556582400000", + key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", + platform = "java8", + role = "timelines", + scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserMention", + segment_type = "snapshot", + tags = ["bazel-compatible"], + val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", + scala_dependencies = [ + ":injections", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) + +create_datasets( + base_name = "user_request_dow_aggregates", + fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_request_dow_aggregates/1556236800000", + key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", + platform = "java8", + role = "timelines", 
+ scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserRequestDow", + segment_type = "snapshot", + tags = ["bazel-compatible"], + val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", + scala_dependencies = [ + ":injections", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) + +create_datasets( + base_name = "user_request_hour_aggregates", + fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_request_hour_aggregates/1556150400000", + key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", + platform = "java8", + role = "timelines", + scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserRequestHour", + segment_type = "snapshot", + tags = ["bazel-compatible"], + val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", + scala_dependencies = [ + ":injections", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) + + +create_datasets( + base_name = "user_list_aggregates", + fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_list_aggregates/1590624000000", + key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", + platform = "java8", + role = "timelines", + scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserList", + segment_type = "snapshot", + tags = ["bazel-compatible"], + val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", + scala_dependencies = [ + ":injections", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) + + +create_datasets( + base_name = "user_media_understanding_annotation_aggregates", + key_type = 
"com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", + platform = "java8", + role = "timelines", + scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserMediaUnderstandingAnnotation", + segment_type = "snapshot", + tags = ["bazel-compatible"], + val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", + scala_dependencies = [ + ":injections", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) + +scala_library( + sources = [ + "BCELabelTransformFromUUADataRecord.scala", + "FeatureSelectorConfig.scala", + "RecapUserFeatureAggregation.scala", + "RectweetUserFeatureAggregation.scala", + "TimelinesAggregationConfig.scala", + "TimelinesAggregationConfigDetails.scala", + "TimelinesAggregationConfigTrait.scala", + "TimelinesAggregationSources.scala", + ], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + ":aggregates_canary-scala", + ":author_topic_aggregates-scala", + ":original_author_aggregates-scala", + ":twitter_wide_user_aggregates-scala", + ":twitter_wide_user_author_aggregates-scala", + ":user_aggregates-scala", + ":user_author_aggregates-scala", + ":user_engager_aggregates-scala", + ":user_inferred_topic_aggregates-scala", + ":user_list_aggregates-scala", + ":user_media_understanding_annotation_aggregates-scala", + ":user_mention_aggregates-scala", + ":user_original_author_aggregates-scala", + ":user_request_dow_aggregates-scala", + ":user_request_hour_aggregates-scala", + ":user_topic_aggregates-scala", + "src/java/com/twitter/ml/api:api-base", + "src/java/com/twitter/ml/api/constant", + "src/java/com/twitter/ml/api/matcher", + "src/scala/com/twitter/common/text/util", + "src/scala/com/twitter/dal/client/dataset", + "src/scala/com/twitter/frigate/data_pipeline/features_aggregated/core", + "src/scala/com/twitter/scalding_internal/multiformat/format", + 
"src/scala/com/twitter/timelines/prediction/common/adapters:engagement-converter", + "src/scala/com/twitter/timelines/prediction/features/client_log_event", + "src/scala/com/twitter/timelines/prediction/features/common", + "src/scala/com/twitter/timelines/prediction/features/engagement_features", + "src/scala/com/twitter/timelines/prediction/features/escherbird", + "src/scala/com/twitter/timelines/prediction/features/itl", + "src/scala/com/twitter/timelines/prediction/features/list_features", + "src/scala/com/twitter/timelines/prediction/features/p_home_latest", + "src/scala/com/twitter/timelines/prediction/features/real_graph", + "src/scala/com/twitter/timelines/prediction/features/recap", + "src/scala/com/twitter/timelines/prediction/features/request_context", + "src/scala/com/twitter/timelines/prediction/features/simcluster", + "src/scala/com/twitter/timelines/prediction/features/time_features", + "src/scala/com/twitter/timelines/prediction/transform/filter", + "src/thrift/com/twitter/timelines/suggests/common:engagement-scala", + "timelines/data_processing/ad_hoc/recap/data_record_preparation:recap_data_records_agg_minimal-java", + "util/util-core:scala", + ], +) + +scala_library( + name = "injections", + sources = [ + "FeatureSelectorConfig.scala", + "RecapUserFeatureAggregation.scala", + "RectweetUserFeatureAggregation.scala", + "TimelinesAggregationConfigDetails.scala", + "TimelinesAggregationConfigTrait.scala", + "TimelinesAggregationKeyValInjections.scala", + "TimelinesAggregationSources.scala", + ], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "src/java/com/twitter/ml/api:api-base", + "src/java/com/twitter/ml/api/constant", + "src/java/com/twitter/ml/api/matcher", + "src/scala/com/twitter/common/text/util", + "src/scala/com/twitter/dal/client/dataset", + "src/scala/com/twitter/frigate/data_pipeline/features_aggregated/core", + "src/scala/com/twitter/scalding_internal/multiformat/format", + 
/**
 * Feature selectors (glob matchers) for the aggregate features and key features to keep.
 *
 * Aggregate features are selected by globs of the form `<prefix>.<label>.<suffix>`, generated
 * from every (prefix, suffix) pair crossed with every entry of [[LabelsToStore]].
 */
object FeatureSelectorConfig {

  /**
   * (aggregate prefix, feature-glob suffix) pairs shared by the full and the reduced selectors.
   * Each pair is listed once: a repeated identical pair only yields duplicate glob matchers.
   */
  val BasePairsToStore = Seq(
    ("twitter_wide_user_aggregate.pair", "*"),
    ("twitter_wide_user_author_aggregate.pair", "*"),
    ("user_aggregate_v5.continuous.pair", "*"),
    ("user_aggregate_v7.pair", "*"),
    ("user_author_aggregate_v2.pair", "recap.earlybird.*"),
    ("user_author_aggregate_v2.pair", "recap.searchfeature.*"),
    ("user_author_aggregate_v2.pair", "recap.tweetfeature.embeds*"),
    ("user_author_aggregate_v2.pair", "recap.tweetfeature.link_count*"),
    ("user_author_aggregate_v2.pair", "engagement_features.in_network.*"),
    ("user_author_aggregate_v2.pair", "recap.tweetfeature.is_reply.*"),
    ("user_author_aggregate_v2.pair", "recap.tweetfeature.is_retweet.*"),
    ("user_author_aggregate_v2.pair", "recap.tweetfeature.num_mentions.*"),
    ("user_author_aggregate_v5.pair", "*"),
    ("user_author_aggregate_tweetsource_v1.pair", "*"),
    ("user_engager_aggregate.pair", "*"),
    ("user_mention_aggregate.pair", "*"),
    ("user_request_context_aggregate.dow.pair", "*"),
    ("user_request_context_aggregate.hour.pair", "*"),
    ("user_aggregate_v6.pair", "*"),
    ("user_original_author_aggregate_v1.pair", "*"),
    ("user_original_author_aggregate_v2.pair", "*"),
    ("original_author_aggregate_v1.pair", "*"),
    ("original_author_aggregate_v2.pair", "*"),
    ("author_topic_aggregate.pair", "*"),
    ("user_list_aggregate.pair", "*"),
    ("user_topic_aggregate.pair", "*"),
    ("user_topic_aggregate_v2.pair", "*"),
    ("user_inferred_topic_aggregate.pair", "*"),
    ("user_inferred_topic_aggregate_v2.pair", "*"),
    ("user_media_annotation_aggregate.pair", "*"),
    ("user_author_good_click_aggregate.pair", "*"),
    ("user_engager_good_click_aggregate.pair", "*")
  )

  /** [[BasePairsToStore]] plus the pairs that only the full selector keeps. */
  val PairsToStore = BasePairsToStore ++ Seq(
    ("user_aggregate_v2.pair", "*"),
    ("user_aggregate_v5.boolean.pair", "*"),
    ("user_aggregate_tweetsource_v1.pair", "*")
  )

  /** Label names that form the middle segment of every generated glob. */
  val LabelsToStore = Seq(
    "any_label",
    "recap.engagement.is_favorited",
    "recap.engagement.is_retweeted",
    "recap.engagement.is_replied",
    "recap.engagement.is_open_linked",
    "recap.engagement.is_profile_clicked",
    "recap.engagement.is_clicked",
    "recap.engagement.is_photo_expanded",
    "recap.engagement.is_video_playback_50",
    "recap.engagement.is_video_quality_viewed",
    "recap.engagement.is_replied_reply_impressed_by_author",
    "recap.engagement.is_replied_reply_favorited_by_author",
    "recap.engagement.is_replied_reply_replied_by_author",
    "recap.engagement.is_report_tweet_clicked",
    "recap.engagement.is_block_clicked",
    "recap.engagement.is_mute_clicked",
    "recap.engagement.is_dont_like",
    "recap.engagement.is_good_clicked_convo_desc_favorited_or_replied",
    "recap.engagement.is_good_clicked_convo_desc_v2",
    "itl.engagement.is_favorited",
    "itl.engagement.is_retweeted",
    "itl.engagement.is_replied",
    "itl.engagement.is_open_linked",
    "itl.engagement.is_profile_clicked",
    "itl.engagement.is_clicked",
    "itl.engagement.is_photo_expanded",
    "itl.engagement.is_video_playback_50"
  )

  // One glob matcher per (pair, label) combination, pair-major and label-minor.
  // Must stay below LabelsToStore: it is called while the vals below are being initialised.
  private def globsFor(pairs: Seq[(String, String)]) =
    for {
      (prefix, suffix) <- pairs
      label <- LabelsToStore
    } yield FeatureMatcher.glob(s"${prefix}.${label}.${suffix}")

  /** Globs for every pair in [[PairsToStore]] crossed with every label. */
  val PairGlobsToStore = globsFor(PairsToStore)

  /** Matches the key/entity/request-context features plus two explicit aggregate globs. */
  val BaseAggregateV2FeatureSelector = FeatureMatcher
    .none()
    .or(
      FeatureMatcher.glob("meta.user_id"),
      FeatureMatcher.glob("meta.author_id"),
      FeatureMatcher.glob("entities.original_author_id"),
      FeatureMatcher.glob("entities.topic_id"),
      FeatureMatcher
        .glob("entities.inferred_topic_ids" + TypedAggregateGroup.SparseFeatureSuffix),
      FeatureMatcher.glob("timelines.meta.list_id"),
      FeatureMatcher.glob("list.id"),
      FeatureMatcher
        .glob("engagement_features.user_ids.public" + TypedAggregateGroup.SparseFeatureSuffix),
      FeatureMatcher
        .glob("entities.users.mentioned_screen_names" + TypedAggregateGroup.SparseFeatureSuffix),
      FeatureMatcher.glob("user_aggregate_v2.pair.recap.engagement.is_dont_like.*"),
      FeatureMatcher.glob("user_author_aggregate_v2.pair.any_label.recap.tweetfeature.has_*"),
      FeatureMatcher.glob("request_context.country_code"),
      FeatureMatcher.glob("request_context.timestamp_gmt_dow"),
      FeatureMatcher.glob("request_context.timestamp_gmt_hour"),
      FeatureMatcher.glob(
        "semantic_core.media_understanding.high_recall.non_sensitive.entity_ids" + TypedAggregateGroup.SparseFeatureSuffix)
    )

  /** [[BaseAggregateV2FeatureSelector]] OR-ed with every glob in [[PairGlobsToStore]]. */
  val AggregatesV2ProdFeatureSelector = BaseAggregateV2FeatureSelector
    .orList(PairGlobsToStore.asJava)

  /**
   * Reduced glob set: every [[BasePairsToStore]] pair crossed with every label, plus four
   * explicitly listed `user_aggregate_v2` globs.
   */
  val ReducedPairGlobsToStore = globsFor(BasePairsToStore) ++ Seq(
    FeatureMatcher.glob("user_aggregate_v2.pair.any_label.*"),
    FeatureMatcher.glob("user_aggregate_v2.pair.recap.engagement.is_favorited.*"),
    FeatureMatcher.glob("user_aggregate_v2.pair.recap.engagement.is_photo_expanded.*"),
    FeatureMatcher.glob("user_aggregate_v2.pair.recap.engagement.is_profile_clicked.*")
  )
}
\ No newline at end of file diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/RecapUserFeatureAggregation.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/RecapUserFeatureAggregation.scala new file mode 100644 index 0000000000..657d5a713b --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/RecapUserFeatureAggregation.scala @@ -0,0 +1,415 @@ +package com.twitter.timelines.prediction.common.aggregates + +import com.twitter.ml.api.Feature +import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures +import com.twitter.timelines.prediction.features.engagement_features.EngagementDataRecordFeatures +import com.twitter.timelines.prediction.features.real_graph.RealGraphDataRecordFeatures +import com.twitter.timelines.prediction.features.recap.RecapFeatures +import com.twitter.timelines.prediction.features.time_features.TimeDataRecordFeatures + +object RecapUserFeatureAggregation { + val RecapFeaturesForAggregation: Set[Feature[_]] = + Set( + RecapFeatures.HAS_IMAGE, + RecapFeatures.HAS_VIDEO, + RecapFeatures.FROM_MUTUAL_FOLLOW, + RecapFeatures.HAS_CARD, + RecapFeatures.HAS_NEWS, + RecapFeatures.REPLY_COUNT, + RecapFeatures.FAV_COUNT, + RecapFeatures.RETWEET_COUNT, + RecapFeatures.BLENDER_SCORE, + RecapFeatures.CONVERSATIONAL_COUNT, + RecapFeatures.IS_BUSINESS_SCORE, + RecapFeatures.CONTAINS_MEDIA, + RecapFeatures.RETWEET_SEARCHER, + RecapFeatures.REPLY_SEARCHER, + RecapFeatures.MENTION_SEARCHER, + RecapFeatures.REPLY_OTHER, + RecapFeatures.RETWEET_OTHER, + RecapFeatures.MATCH_UI_LANG, + RecapFeatures.MATCH_SEARCHER_MAIN_LANG, + RecapFeatures.MATCH_SEARCHER_LANGS, + RecapFeatures.TWEET_COUNT_FROM_USER_IN_SNAPSHOT, + RecapFeatures.TEXT_SCORE, + RealGraphDataRecordFeatures.NUM_RETWEETS_EWMA, + RealGraphDataRecordFeatures.NUM_RETWEETS_NON_ZERO_DAYS, + RealGraphDataRecordFeatures.NUM_RETWEETS_ELAPSED_DAYS, + RealGraphDataRecordFeatures.NUM_RETWEETS_DAYS_SINCE_LAST, + 
RealGraphDataRecordFeatures.NUM_FAVORITES_EWMA, + RealGraphDataRecordFeatures.NUM_FAVORITES_NON_ZERO_DAYS, + RealGraphDataRecordFeatures.NUM_FAVORITES_ELAPSED_DAYS, + RealGraphDataRecordFeatures.NUM_FAVORITES_DAYS_SINCE_LAST, + RealGraphDataRecordFeatures.NUM_MENTIONS_EWMA, + RealGraphDataRecordFeatures.NUM_MENTIONS_NON_ZERO_DAYS, + RealGraphDataRecordFeatures.NUM_MENTIONS_ELAPSED_DAYS, + RealGraphDataRecordFeatures.NUM_MENTIONS_DAYS_SINCE_LAST, + RealGraphDataRecordFeatures.NUM_TWEET_CLICKS_EWMA, + RealGraphDataRecordFeatures.NUM_TWEET_CLICKS_NON_ZERO_DAYS, + RealGraphDataRecordFeatures.NUM_TWEET_CLICKS_ELAPSED_DAYS, + RealGraphDataRecordFeatures.NUM_TWEET_CLICKS_DAYS_SINCE_LAST, + RealGraphDataRecordFeatures.NUM_PROFILE_VIEWS_EWMA, + RealGraphDataRecordFeatures.NUM_PROFILE_VIEWS_NON_ZERO_DAYS, + RealGraphDataRecordFeatures.NUM_PROFILE_VIEWS_ELAPSED_DAYS, + RealGraphDataRecordFeatures.NUM_PROFILE_VIEWS_DAYS_SINCE_LAST, + RealGraphDataRecordFeatures.TOTAL_DWELL_TIME_EWMA, + RealGraphDataRecordFeatures.TOTAL_DWELL_TIME_NON_ZERO_DAYS, + RealGraphDataRecordFeatures.TOTAL_DWELL_TIME_ELAPSED_DAYS, + RealGraphDataRecordFeatures.TOTAL_DWELL_TIME_DAYS_SINCE_LAST, + RealGraphDataRecordFeatures.NUM_INSPECTED_TWEETS_EWMA, + RealGraphDataRecordFeatures.NUM_INSPECTED_TWEETS_NON_ZERO_DAYS, + RealGraphDataRecordFeatures.NUM_INSPECTED_TWEETS_ELAPSED_DAYS, + RealGraphDataRecordFeatures.NUM_INSPECTED_TWEETS_DAYS_SINCE_LAST + ) + + val RecapLabelsForAggregation: Set[Feature.Binary] = + Set( + RecapFeatures.IS_FAVORITED, + RecapFeatures.IS_RETWEETED, + RecapFeatures.IS_CLICKED, + RecapFeatures.IS_PROFILE_CLICKED, + RecapFeatures.IS_OPEN_LINKED + ) + + val DwellDuration: Set[Feature[_]] = + Set( + TimelinesSharedFeatures.DWELL_TIME_MS, + ) + + val UserFeaturesV2: Set[Feature[_]] = RecapFeaturesForAggregation ++ Set( + RecapFeatures.HAS_VINE, + RecapFeatures.HAS_PERISCOPE, + RecapFeatures.HAS_PRO_VIDEO, + RecapFeatures.HAS_VISIBLE_LINK, + RecapFeatures.BIDIRECTIONAL_FAV_COUNT, + 
RecapFeatures.UNIDIRECTIONAL_FAV_COUNT, + RecapFeatures.BIDIRECTIONAL_REPLY_COUNT, + RecapFeatures.UNIDIRECTIONAL_REPLY_COUNT, + RecapFeatures.BIDIRECTIONAL_RETWEET_COUNT, + RecapFeatures.UNIDIRECTIONAL_RETWEET_COUNT, + RecapFeatures.EMBEDS_URL_COUNT, + RecapFeatures.EMBEDS_IMPRESSION_COUNT, + RecapFeatures.VIDEO_VIEW_COUNT, + RecapFeatures.IS_RETWEET, + RecapFeatures.IS_REPLY, + RecapFeatures.IS_EXTENDED_REPLY, + RecapFeatures.HAS_LINK, + RecapFeatures.HAS_TREND, + RecapFeatures.LINK_LANGUAGE, + RecapFeatures.NUM_HASHTAGS, + RecapFeatures.NUM_MENTIONS, + RecapFeatures.IS_SENSITIVE, + RecapFeatures.HAS_MULTIPLE_MEDIA, + RecapFeatures.USER_REP, + RecapFeatures.FAV_COUNT_V2, + RecapFeatures.RETWEET_COUNT_V2, + RecapFeatures.REPLY_COUNT_V2, + RecapFeatures.LINK_COUNT, + EngagementDataRecordFeatures.InNetworkFavoritesCount, + EngagementDataRecordFeatures.InNetworkRetweetsCount, + EngagementDataRecordFeatures.InNetworkRepliesCount + ) + + val UserAuthorFeaturesV2: Set[Feature[_]] = Set( + RecapFeatures.HAS_IMAGE, + RecapFeatures.HAS_VINE, + RecapFeatures.HAS_PERISCOPE, + RecapFeatures.HAS_PRO_VIDEO, + RecapFeatures.HAS_VIDEO, + RecapFeatures.HAS_CARD, + RecapFeatures.HAS_NEWS, + RecapFeatures.HAS_VISIBLE_LINK, + RecapFeatures.REPLY_COUNT, + RecapFeatures.FAV_COUNT, + RecapFeatures.RETWEET_COUNT, + RecapFeatures.BLENDER_SCORE, + RecapFeatures.CONVERSATIONAL_COUNT, + RecapFeatures.IS_BUSINESS_SCORE, + RecapFeatures.CONTAINS_MEDIA, + RecapFeatures.RETWEET_SEARCHER, + RecapFeatures.REPLY_SEARCHER, + RecapFeatures.MENTION_SEARCHER, + RecapFeatures.REPLY_OTHER, + RecapFeatures.RETWEET_OTHER, + RecapFeatures.MATCH_UI_LANG, + RecapFeatures.MATCH_SEARCHER_MAIN_LANG, + RecapFeatures.MATCH_SEARCHER_LANGS, + RecapFeatures.TWEET_COUNT_FROM_USER_IN_SNAPSHOT, + RecapFeatures.TEXT_SCORE, + RecapFeatures.BIDIRECTIONAL_FAV_COUNT, + RecapFeatures.UNIDIRECTIONAL_FAV_COUNT, + RecapFeatures.BIDIRECTIONAL_REPLY_COUNT, + RecapFeatures.UNIDIRECTIONAL_REPLY_COUNT, + 
RecapFeatures.BIDIRECTIONAL_RETWEET_COUNT, + RecapFeatures.UNIDIRECTIONAL_RETWEET_COUNT, + RecapFeatures.EMBEDS_URL_COUNT, + RecapFeatures.EMBEDS_IMPRESSION_COUNT, + RecapFeatures.VIDEO_VIEW_COUNT, + RecapFeatures.IS_RETWEET, + RecapFeatures.IS_REPLY, + RecapFeatures.HAS_LINK, + RecapFeatures.HAS_TREND, + RecapFeatures.LINK_LANGUAGE, + RecapFeatures.NUM_HASHTAGS, + RecapFeatures.NUM_MENTIONS, + RecapFeatures.IS_SENSITIVE, + RecapFeatures.HAS_MULTIPLE_MEDIA, + RecapFeatures.FAV_COUNT_V2, + RecapFeatures.RETWEET_COUNT_V2, + RecapFeatures.REPLY_COUNT_V2, + RecapFeatures.LINK_COUNT, + EngagementDataRecordFeatures.InNetworkFavoritesCount, + EngagementDataRecordFeatures.InNetworkRetweetsCount, + EngagementDataRecordFeatures.InNetworkRepliesCount + ) + + val UserAuthorFeaturesV2Count: Set[Feature[_]] = Set( + RecapFeatures.HAS_IMAGE, + RecapFeatures.HAS_VINE, + RecapFeatures.HAS_PERISCOPE, + RecapFeatures.HAS_PRO_VIDEO, + RecapFeatures.HAS_VIDEO, + RecapFeatures.HAS_CARD, + RecapFeatures.HAS_NEWS, + RecapFeatures.HAS_VISIBLE_LINK, + RecapFeatures.FAV_COUNT, + RecapFeatures.CONTAINS_MEDIA, + RecapFeatures.RETWEET_SEARCHER, + RecapFeatures.REPLY_SEARCHER, + RecapFeatures.MENTION_SEARCHER, + RecapFeatures.REPLY_OTHER, + RecapFeatures.RETWEET_OTHER, + RecapFeatures.MATCH_UI_LANG, + RecapFeatures.MATCH_SEARCHER_MAIN_LANG, + RecapFeatures.MATCH_SEARCHER_LANGS, + RecapFeatures.IS_RETWEET, + RecapFeatures.IS_REPLY, + RecapFeatures.HAS_LINK, + RecapFeatures.HAS_TREND, + RecapFeatures.IS_SENSITIVE, + RecapFeatures.HAS_MULTIPLE_MEDIA, + EngagementDataRecordFeatures.InNetworkFavoritesCount + ) + + val UserTopicFeaturesV2Count: Set[Feature[_]] = Set( + RecapFeatures.HAS_IMAGE, + RecapFeatures.HAS_VIDEO, + RecapFeatures.HAS_CARD, + RecapFeatures.HAS_NEWS, + RecapFeatures.FAV_COUNT, + RecapFeatures.CONTAINS_MEDIA, + RecapFeatures.RETWEET_SEARCHER, + RecapFeatures.REPLY_SEARCHER, + RecapFeatures.MENTION_SEARCHER, + RecapFeatures.REPLY_OTHER, + RecapFeatures.RETWEET_OTHER, + 
RecapFeatures.MATCH_UI_LANG, + RecapFeatures.MATCH_SEARCHER_MAIN_LANG, + RecapFeatures.MATCH_SEARCHER_LANGS, + RecapFeatures.IS_RETWEET, + RecapFeatures.IS_REPLY, + RecapFeatures.HAS_LINK, + RecapFeatures.HAS_TREND, + RecapFeatures.IS_SENSITIVE, + EngagementDataRecordFeatures.InNetworkFavoritesCount, + EngagementDataRecordFeatures.InNetworkRetweetsCount, + TimelinesSharedFeatures.NUM_CAPS, + TimelinesSharedFeatures.ASPECT_RATIO_DEN, + TimelinesSharedFeatures.NUM_NEWLINES, + TimelinesSharedFeatures.IS_360, + TimelinesSharedFeatures.IS_MANAGED, + TimelinesSharedFeatures.IS_MONETIZABLE, + TimelinesSharedFeatures.HAS_SELECTED_PREVIEW_IMAGE, + TimelinesSharedFeatures.HAS_TITLE, + TimelinesSharedFeatures.HAS_DESCRIPTION, + TimelinesSharedFeatures.HAS_VISIT_SITE_CALL_TO_ACTION, + TimelinesSharedFeatures.HAS_WATCH_NOW_CALL_TO_ACTION + ) + + val UserFeaturesV5Continuous: Set[Feature[_]] = Set( + TimelinesSharedFeatures.QUOTE_COUNT, + TimelinesSharedFeatures.VISIBLE_TOKEN_RATIO, + TimelinesSharedFeatures.WEIGHTED_FAV_COUNT, + TimelinesSharedFeatures.WEIGHTED_RETWEET_COUNT, + TimelinesSharedFeatures.WEIGHTED_REPLY_COUNT, + TimelinesSharedFeatures.WEIGHTED_QUOTE_COUNT, + TimelinesSharedFeatures.EMBEDS_IMPRESSION_COUNT_V2, + TimelinesSharedFeatures.EMBEDS_URL_COUNT_V2, + TimelinesSharedFeatures.DECAYED_FAVORITE_COUNT, + TimelinesSharedFeatures.DECAYED_RETWEET_COUNT, + TimelinesSharedFeatures.DECAYED_REPLY_COUNT, + TimelinesSharedFeatures.DECAYED_QUOTE_COUNT, + TimelinesSharedFeatures.FAKE_FAVORITE_COUNT, + TimelinesSharedFeatures.FAKE_RETWEET_COUNT, + TimelinesSharedFeatures.FAKE_REPLY_COUNT, + TimelinesSharedFeatures.FAKE_QUOTE_COUNT, + TimeDataRecordFeatures.LAST_FAVORITE_SINCE_CREATION_HRS, + TimeDataRecordFeatures.LAST_RETWEET_SINCE_CREATION_HRS, + TimeDataRecordFeatures.LAST_REPLY_SINCE_CREATION_HRS, + TimeDataRecordFeatures.LAST_QUOTE_SINCE_CREATION_HRS, + TimeDataRecordFeatures.TIME_SINCE_LAST_FAVORITE_HRS, + TimeDataRecordFeatures.TIME_SINCE_LAST_RETWEET_HRS, + 
TimeDataRecordFeatures.TIME_SINCE_LAST_REPLY_HRS, + TimeDataRecordFeatures.TIME_SINCE_LAST_QUOTE_HRS + ) + + val UserFeaturesV5Boolean: Set[Feature[_]] = Set( + TimelinesSharedFeatures.LABEL_ABUSIVE_FLAG, + TimelinesSharedFeatures.LABEL_ABUSIVE_HI_RCL_FLAG, + TimelinesSharedFeatures.LABEL_DUP_CONTENT_FLAG, + TimelinesSharedFeatures.LABEL_NSFW_HI_PRC_FLAG, + TimelinesSharedFeatures.LABEL_NSFW_HI_RCL_FLAG, + TimelinesSharedFeatures.LABEL_SPAM_FLAG, + TimelinesSharedFeatures.LABEL_SPAM_HI_RCL_FLAG, + TimelinesSharedFeatures.PERISCOPE_EXISTS, + TimelinesSharedFeatures.PERISCOPE_IS_LIVE, + TimelinesSharedFeatures.PERISCOPE_HAS_BEEN_FEATURED, + TimelinesSharedFeatures.PERISCOPE_IS_CURRENTLY_FEATURED, + TimelinesSharedFeatures.PERISCOPE_IS_FROM_QUALITY_SOURCE, + TimelinesSharedFeatures.HAS_QUOTE + ) + + val UserAuthorFeaturesV5: Set[Feature[_]] = Set( + TimelinesSharedFeatures.HAS_QUOTE, + TimelinesSharedFeatures.LABEL_ABUSIVE_FLAG, + TimelinesSharedFeatures.LABEL_ABUSIVE_HI_RCL_FLAG, + TimelinesSharedFeatures.LABEL_DUP_CONTENT_FLAG, + TimelinesSharedFeatures.LABEL_NSFW_HI_PRC_FLAG, + TimelinesSharedFeatures.LABEL_NSFW_HI_RCL_FLAG, + TimelinesSharedFeatures.LABEL_SPAM_FLAG, + TimelinesSharedFeatures.LABEL_SPAM_HI_RCL_FLAG + ) + + val UserTweetSourceFeaturesV1Continuous: Set[Feature[_]] = Set( + TimelinesSharedFeatures.NUM_CAPS, + TimelinesSharedFeatures.NUM_WHITESPACES, + TimelinesSharedFeatures.TWEET_LENGTH, + TimelinesSharedFeatures.ASPECT_RATIO_DEN, + TimelinesSharedFeatures.ASPECT_RATIO_NUM, + TimelinesSharedFeatures.BIT_RATE, + TimelinesSharedFeatures.HEIGHT_1, + TimelinesSharedFeatures.HEIGHT_2, + TimelinesSharedFeatures.HEIGHT_3, + TimelinesSharedFeatures.HEIGHT_4, + TimelinesSharedFeatures.VIDEO_DURATION, + TimelinesSharedFeatures.WIDTH_1, + TimelinesSharedFeatures.WIDTH_2, + TimelinesSharedFeatures.WIDTH_3, + TimelinesSharedFeatures.WIDTH_4, + TimelinesSharedFeatures.NUM_MEDIA_TAGS + ) + + val UserTweetSourceFeaturesV1Boolean: Set[Feature[_]] = Set( + 
TimelinesSharedFeatures.HAS_QUESTION, + TimelinesSharedFeatures.RESIZE_METHOD_1, + TimelinesSharedFeatures.RESIZE_METHOD_2, + TimelinesSharedFeatures.RESIZE_METHOD_3, + TimelinesSharedFeatures.RESIZE_METHOD_4 + ) + + val UserTweetSourceFeaturesV2Continuous: Set[Feature[_]] = Set( + TimelinesSharedFeatures.NUM_EMOJIS, + TimelinesSharedFeatures.NUM_EMOTICONS, + TimelinesSharedFeatures.NUM_NEWLINES, + TimelinesSharedFeatures.NUM_STICKERS, + TimelinesSharedFeatures.NUM_FACES, + TimelinesSharedFeatures.NUM_COLOR_PALLETTE_ITEMS, + TimelinesSharedFeatures.VIEW_COUNT, + TimelinesSharedFeatures.TWEET_LENGTH_TYPE + ) + + val UserTweetSourceFeaturesV2Boolean: Set[Feature[_]] = Set( + TimelinesSharedFeatures.IS_360, + TimelinesSharedFeatures.IS_MANAGED, + TimelinesSharedFeatures.IS_MONETIZABLE, + TimelinesSharedFeatures.IS_EMBEDDABLE, + TimelinesSharedFeatures.HAS_SELECTED_PREVIEW_IMAGE, + TimelinesSharedFeatures.HAS_TITLE, + TimelinesSharedFeatures.HAS_DESCRIPTION, + TimelinesSharedFeatures.HAS_VISIT_SITE_CALL_TO_ACTION, + TimelinesSharedFeatures.HAS_WATCH_NOW_CALL_TO_ACTION + ) + + val UserAuthorTweetSourceFeaturesV1: Set[Feature[_]] = Set( + TimelinesSharedFeatures.HAS_QUESTION, + TimelinesSharedFeatures.TWEET_LENGTH, + TimelinesSharedFeatures.VIDEO_DURATION, + TimelinesSharedFeatures.NUM_MEDIA_TAGS + ) + + val UserAuthorTweetSourceFeaturesV2: Set[Feature[_]] = Set( + TimelinesSharedFeatures.NUM_CAPS, + TimelinesSharedFeatures.NUM_WHITESPACES, + TimelinesSharedFeatures.ASPECT_RATIO_DEN, + TimelinesSharedFeatures.ASPECT_RATIO_NUM, + TimelinesSharedFeatures.BIT_RATE, + TimelinesSharedFeatures.TWEET_LENGTH_TYPE, + TimelinesSharedFeatures.NUM_EMOJIS, + TimelinesSharedFeatures.NUM_EMOTICONS, + TimelinesSharedFeatures.NUM_NEWLINES, + TimelinesSharedFeatures.NUM_STICKERS, + TimelinesSharedFeatures.NUM_FACES, + TimelinesSharedFeatures.IS_360, + TimelinesSharedFeatures.IS_MANAGED, + TimelinesSharedFeatures.IS_MONETIZABLE, + TimelinesSharedFeatures.HAS_SELECTED_PREVIEW_IMAGE, + 
TimelinesSharedFeatures.HAS_TITLE, + TimelinesSharedFeatures.HAS_DESCRIPTION, + TimelinesSharedFeatures.HAS_VISIT_SITE_CALL_TO_ACTION, + TimelinesSharedFeatures.HAS_WATCH_NOW_CALL_TO_ACTION + ) + + val UserAuthorTweetSourceFeaturesV2Count: Set[Feature[_]] = Set( + TimelinesSharedFeatures.NUM_CAPS, + TimelinesSharedFeatures.ASPECT_RATIO_DEN, + TimelinesSharedFeatures.NUM_NEWLINES, + TimelinesSharedFeatures.IS_360, + TimelinesSharedFeatures.IS_MANAGED, + TimelinesSharedFeatures.IS_MONETIZABLE, + TimelinesSharedFeatures.HAS_SELECTED_PREVIEW_IMAGE, + TimelinesSharedFeatures.HAS_TITLE, + TimelinesSharedFeatures.HAS_DESCRIPTION, + TimelinesSharedFeatures.HAS_VISIT_SITE_CALL_TO_ACTION, + TimelinesSharedFeatures.HAS_WATCH_NOW_CALL_TO_ACTION + ) + + val LabelsV2: Set[Feature.Binary] = RecapLabelsForAggregation ++ Set( + RecapFeatures.IS_REPLIED, + RecapFeatures.IS_PHOTO_EXPANDED, + RecapFeatures.IS_VIDEO_PLAYBACK_50 + ) + + val TwitterWideFeatures: Set[Feature[_]] = Set( + RecapFeatures.IS_REPLY, + TimelinesSharedFeatures.HAS_QUOTE, + RecapFeatures.HAS_MENTION, + RecapFeatures.HAS_HASHTAG, + RecapFeatures.HAS_LINK, + RecapFeatures.HAS_CARD, + RecapFeatures.CONTAINS_MEDIA + ) + + val TwitterWideLabels: Set[Feature.Binary] = Set( + RecapFeatures.IS_FAVORITED, + RecapFeatures.IS_RETWEETED, + RecapFeatures.IS_REPLIED + ) + + val ReciprocalLabels: Set[Feature.Binary] = Set( + RecapFeatures.IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR, + RecapFeatures.IS_REPLIED_REPLY_REPLIED_BY_AUTHOR, + RecapFeatures.IS_REPLIED_REPLY_FAVORITED_BY_AUTHOR + ) + + val NegativeEngagementLabels: Set[Feature.Binary] = Set( + RecapFeatures.IS_REPORT_TWEET_CLICKED, + RecapFeatures.IS_BLOCK_CLICKED, + RecapFeatures.IS_MUTE_CLICKED, + RecapFeatures.IS_DONT_LIKE + ) + + val GoodClickLabels: Set[Feature.Binary] = Set( + RecapFeatures.IS_GOOD_CLICKED_CONVO_DESC_V1, + RecapFeatures.IS_GOOD_CLICKED_CONVO_DESC_V2, + ) +} diff --git 
a/src/scala/com/twitter/timelines/prediction/common/aggregates/RectweetUserFeatureAggregation.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/RectweetUserFeatureAggregation.scala new file mode 100644 index 0000000000..12835ef1f7 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/RectweetUserFeatureAggregation.scala @@ -0,0 +1,52 @@ +package com.twitter.timelines.prediction.common.aggregates + +import com.twitter.ml.api.Feature +import com.twitter.timelines.prediction.features.engagement_features.EngagementDataRecordFeatures +import com.twitter.timelines.prediction.features.itl.ITLFeatures + +object RectweetUserFeatureAggregation { /* Constant sets of ITL-derived labels and tweet features; this object only declares them. */ + val RectweetLabelsForAggregation: Set[Feature.Binary] = /* Binary ITL engagement labels; passed as keepLabels to negativeRecTweetDownsampleTransform in TimelinesAggregationConfigDetails. */ + Set( + ITLFeatures.IS_FAVORITED, + ITLFeatures.IS_RETWEETED, + ITLFeatures.IS_REPLIED, + ITLFeatures.IS_CLICKED, + ITLFeatures.IS_PROFILE_CLICKED, + ITLFeatures.IS_OPEN_LINKED, + ITLFeatures.IS_PHOTO_EXPANDED, + ITLFeatures.IS_VIDEO_PLAYBACK_50 + ) + + val TweetFeatures: Set[Feature[_]] = Set( /* Tweet-level features, all from ITLFeatures except the final InNetworkFavoritesCount entry. */ + ITLFeatures.HAS_IMAGE, + ITLFeatures.HAS_CARD, + ITLFeatures.HAS_NEWS, + ITLFeatures.REPLY_COUNT, + ITLFeatures.FAV_COUNT, + ITLFeatures.REPLY_COUNT, /* NOTE(review): REPLY_COUNT already appears two entries above; the repeat has no effect in a Set - possibly another feature was intended, confirm. */ + ITLFeatures.RETWEET_COUNT, + ITLFeatures.MATCHES_UI_LANG, + ITLFeatures.MATCHES_SEARCHER_MAIN_LANG, + ITLFeatures.MATCHES_SEARCHER_LANGS, + ITLFeatures.TEXT_SCORE, + ITLFeatures.LINK_LANGUAGE, + ITLFeatures.NUM_HASHTAGS, + ITLFeatures.NUM_MENTIONS, + ITLFeatures.IS_SENSITIVE, + ITLFeatures.HAS_VIDEO, + ITLFeatures.HAS_LINK, + ITLFeatures.HAS_VISIBLE_LINK, + EngagementDataRecordFeatures.InNetworkFavoritesCount + // nice to have, but currently not hydrated in the RecommendedTweet payload + //EngagementDataRecordFeatures.InNetworkRetweetsCount, + //EngagementDataRecordFeatures.InNetworkRepliesCount + ) + + val ReciprocalLabels: Set[Feature.Binary] = Set( /* Five IS_REPLIED_REPLY_<X>_BY_AUTHOR labels; RecapUserFeatureAggregation.ReciprocalLabels lists only the IMPRESSED, REPLIED and FAVORITED ones. */ + ITLFeatures.IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR, + ITLFeatures.IS_REPLIED_REPLY_REPLIED_BY_AUTHOR, + 
ITLFeatures.IS_REPLIED_REPLY_FAVORITED_BY_AUTHOR, + ITLFeatures.IS_REPLIED_REPLY_RETWEETED_BY_AUTHOR, + ITLFeatures.IS_REPLIED_REPLY_QUOTED_BY_AUTHOR + ) +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfig.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfig.scala new file mode 100644 index 0000000000..e6581e32e2 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfig.scala @@ -0,0 +1,80 @@ +package com.twitter.timelines.prediction.common.aggregates + +import com.twitter.dal.client.dataset.KeyValDALDataset +import com.twitter.ml.api.DataRecord +import com.twitter.ml.api.FeatureContext +import com.twitter.scalding_internal.multiformat.format.keyval +import com.twitter.summingbird.batch.BatchID +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion.CombineCountsPolicy +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateStore +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.OfflineAggregateDataRecordStore +import scala.collection.JavaConverters._ + +object TimelinesAggregationConfig extends TimelinesAggregationConfigTrait { + override def outputHdfsPath: String = "/user/timelines/processed/aggregates_v2" + + def storeToDatasetMap: Map[String, KeyValDALDataset[ + keyval.KeyVal[AggregationKey, (BatchID, DataRecord)] + ]] = Map( + AuthorTopicAggregateStore -> AuthorTopicAggregatesScalaDataset, + UserTopicAggregateStore -> UserTopicAggregatesScalaDataset, + UserInferredTopicAggregateStore -> UserInferredTopicAggregatesScalaDataset, + UserAggregateStore -> UserAggregatesScalaDataset, + UserAuthorAggregateStore -> UserAuthorAggregatesScalaDataset, + UserOriginalAuthorAggregateStore -> UserOriginalAuthorAggregatesScalaDataset, + 
OriginalAuthorAggregateStore -> OriginalAuthorAggregatesScalaDataset, + UserEngagerAggregateStore -> UserEngagerAggregatesScalaDataset, + UserMentionAggregateStore -> UserMentionAggregatesScalaDataset, + TwitterWideUserAggregateStore -> TwitterWideUserAggregatesScalaDataset, + TwitterWideUserAuthorAggregateStore -> TwitterWideUserAuthorAggregatesScalaDataset, + UserRequestHourAggregateStore -> UserRequestHourAggregatesScalaDataset, + UserRequestDowAggregateStore -> UserRequestDowAggregatesScalaDataset, + UserListAggregateStore -> UserListAggregatesScalaDataset, + UserMediaUnderstandingAnnotationAggregateStore -> UserMediaUnderstandingAnnotationAggregatesScalaDataset, + ) + + override def mkPhysicalStore(store: AggregateStore): AggregateStore = store match { /* Resolves an offline logical store to a DAL-backed store via storeToDatasetMap; every other store type is rejected with IllegalArgumentException. */ + case s: OfflineAggregateDataRecordStore => + s.toOfflineAggregateDataRecordStoreWithDAL(storeToDatasetMap(s.name)) /* NOTE(review): Map.apply throws NoSuchElementException if s.name is not a key; storeToDatasetMap is a def, so the Map is rebuilt on every call - presumably deliberate, confirm before changing it to a val. */ + case _ => throw new IllegalArgumentException("Unsupported logical dataset type.") + } + + object CombineCountPolicies { /* One CombineCountsPolicy per aggregate prefix, each built by mkCountsPolicy. */ + val EngagerCountsPolicy: CombineCountsPolicy = mkCountsPolicy("user_engager_aggregate") + val EngagerGoodClickCountsPolicy: CombineCountsPolicy = mkCountsPolicy( + "user_engager_good_click_aggregate") + val RectweetEngagerCountsPolicy: CombineCountsPolicy = + mkCountsPolicy("rectweet_user_engager_aggregate") + val MentionCountsPolicy: CombineCountsPolicy = mkCountsPolicy("user_mention_aggregate") + val RectweetSimclustersTweetCountsPolicy: CombineCountsPolicy = + mkCountsPolicy("rectweet_user_simcluster_tweet_aggregate") + val UserInferredTopicCountsPolicy: CombineCountsPolicy = + mkCountsPolicy("user_inferred_topic_aggregate") + val UserInferredTopicV2CountsPolicy: CombineCountsPolicy = + mkCountsPolicy("user_inferred_topic_aggregate_v2") + val UserMediaUnderstandingAnnotationCountsPolicy: CombineCountsPolicy = + mkCountsPolicy("user_media_annotation_aggregate") + + private[this] def mkCountsPolicy(prefix: String): CombineCountsPolicy = { /* Gathers allOutputFeatures of the entries in TimelinesAggregationConfig.aggregatesToCompute whose aggregatePrefix equals prefix and passes them as the policy's aggregateContextToPrecompute (topK = 2, hardLimit = Some(20)). NOTE(review): a prefix that matches nothing yields a context over an empty feature collection rather than an error. */ + val features = 
TimelinesAggregationConfig.aggregatesToCompute + .filter(_.aggregatePrefix == prefix) + .flatMap(_.allOutputFeatures) + CombineCountsPolicy( + topK = 2, + aggregateContextToPrecompute = new FeatureContext(features.asJava), + hardLimit = Some(20) + ) + } + } +} + +object TimelinesAggregationCanaryConfig extends TimelinesAggregationConfigTrait { /* Canary configuration: a separate output HDFS path, and every offline store is bound to the single AggregatesCanaryScalaDataset. */ + override def outputHdfsPath: String = "/user/timelines/canaries/processed/aggregates_v2" + + override def mkPhysicalStore(store: AggregateStore): AggregateStore = store match { + case s: OfflineAggregateDataRecordStore => + s.toOfflineAggregateDataRecordStoreWithDAL(dalDataset = AggregatesCanaryScalaDataset) + case _ => throw new IllegalArgumentException("Unsupported logical dataset type.") + } +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigDetails.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigDetails.scala new file mode 100644 index 0000000000..aa439dedab --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigDetails.scala @@ -0,0 +1,579 @@ +package com.twitter.timelines.prediction.common.aggregates + +import com.twitter.conversions.DurationOps._ +import com.twitter.ml.api.constant.SharedFeatures.AUTHOR_ID +import com.twitter.ml.api.constant.SharedFeatures.USER_ID +import com.twitter.timelines.data_processing.ml_util.aggregation_framework._ +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics._ +import com.twitter.timelines.data_processing.ml_util.transforms.DownsampleTransform +import com.twitter.timelines.data_processing.ml_util.transforms.RichRemoveAuthorIdZero +import com.twitter.timelines.data_processing.ml_util.transforms.RichRemoveUserIdZero +import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures +import com.twitter.timelines.prediction.features.engagement_features.EngagementDataRecordFeatures +import 
com.twitter.timelines.prediction.features.engagement_features.EngagementDataRecordFeatures.RichUnifyPublicEngagersTransform +import com.twitter.timelines.prediction.features.list_features.ListFeatures +import com.twitter.timelines.prediction.features.recap.RecapFeatures +import com.twitter.timelines.prediction.features.request_context.RequestContextFeatures +import com.twitter.timelines.prediction.features.semantic_core_features.SemanticCoreFeatures +import com.twitter.timelines.prediction.transform.filter.FilterInNetworkTransform +import com.twitter.timelines.prediction.transform.filter.FilterImageTweetTransform +import com.twitter.timelines.prediction.transform.filter.FilterVideoTweetTransform +import com.twitter.timelines.prediction.transform.filter.FilterOutImageVideoTweetTransform +import com.twitter.util.Duration + +trait TimelinesAggregationConfigDetails extends Serializable { + + import TimelinesAggregationSources._ + + def outputHdfsPath: String + + /** + * Converts the given logical store to a physical store. The reason we do not specify the + * physical store directly with the [[AggregateGroup]] is because of a cyclic dependency when + * creating physical stores that are DalDataset with PersonalDataType annotations derived from + * the [[AggregateGroup]]. 
+ * + */ + def mkPhysicalStore(store: AggregateStore): AggregateStore + + def defaultMaxKvSourceFailures: Int = 100 + + val timelinesOfflineAggregateSink = new OfflineStoreCommonConfig { + override def apply(startDate: String) = OfflineAggregateStoreCommonConfig( + outputHdfsPathPrefix = outputHdfsPath, + dummyAppId = "timelines_aggregates_v2_ro", + dummyDatasetPrefix = "timelines_aggregates_v2_ro", + startDate = startDate + ) + } + + val UserAggregateStore = "user_aggregates" + val UserAuthorAggregateStore = "user_author_aggregates" + val UserOriginalAuthorAggregateStore = "user_original_author_aggregates" + val OriginalAuthorAggregateStore = "original_author_aggregates" + val UserEngagerAggregateStore = "user_engager_aggregates" + val UserMentionAggregateStore = "user_mention_aggregates" + val TwitterWideUserAggregateStore = "twitter_wide_user_aggregates" + val TwitterWideUserAuthorAggregateStore = "twitter_wide_user_author_aggregates" + val UserRequestHourAggregateStore = "user_request_hour_aggregates" + val UserRequestDowAggregateStore = "user_request_dow_aggregates" + val UserListAggregateStore = "user_list_aggregates" + val AuthorTopicAggregateStore = "author_topic_aggregates" + val UserTopicAggregateStore = "user_topic_aggregates" + val UserInferredTopicAggregateStore = "user_inferred_topic_aggregates" + val UserMediaUnderstandingAnnotationAggregateStore = + "user_media_understanding_annotation_aggregates" + val AuthorCountryCodeAggregateStore = "author_country_code_aggregates" + val OriginalAuthorCountryCodeAggregateStore = "original_author_country_code_aggregates" + + /** + * Step 3: Configure all aggregates to compute. + * Note that different subsets of aggregates in this list + * can be launched by different summingbird job instances. + * Any given job can be responsible for a set of AggregateGroup + * configs whose outputStores share the same exact startDate. 
+ * AggregateGroups that do not share the same inputSource, + * outputStore or startDate MUST be launched using different + * summingbird jobs and passed in a different --start-time argument + * See science/scalding/mesos/timelines/prod.yaml for an example + * of how to configure your own job. + */ + val negativeDownsampleTransform = + DownsampleTransform( + negativeSamplingRate = 0.03, + keepLabels = RecapUserFeatureAggregation.LabelsV2) + val negativeRecTweetDownsampleTransform = DownsampleTransform( + negativeSamplingRate = 0.03, + keepLabels = RectweetUserFeatureAggregation.RectweetLabelsForAggregation + ) + + val userAggregatesV2: AggregateGroup = + AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_aggregate_v2", + preTransforms = Seq(RichRemoveUserIdZero), /* Eliminates reducer skew */ + keys = Set(USER_ID), + features = RecapUserFeatureAggregation.UserFeaturesV2, + labels = RecapUserFeatureAggregation.LabelsV2, + metrics = Set(CountMetric, SumMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserAggregateStore, + startDate = "2016-07-15 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )) + ) + + val userAuthorAggregatesV2: Set[AggregateGroup] = { + + /** + * NOTE: We need to remove records from out-of-network authors from the recap input + * records (which now include out-of-network records as well after merging recap and + * rectweet models) that are used to compute user-author aggregates. This is necessary + * to limit the growth rate of user-author aggregates. 
+ */ + val allFeatureAggregates = Set( + AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_author_aggregate_v2", + preTransforms = Seq(FilterInNetworkTransform, RichRemoveUserIdZero), + keys = Set(USER_ID, AUTHOR_ID), + features = RecapUserFeatureAggregation.UserAuthorFeaturesV2, + labels = RecapUserFeatureAggregation.LabelsV2, + metrics = Set(SumMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserAuthorAggregateStore, + startDate = "2016-07-15 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )) + ) + ) + + val countAggregates: Set[AggregateGroup] = Set( + AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_author_aggregate_v2", + preTransforms = Seq(FilterInNetworkTransform, RichRemoveUserIdZero), + keys = Set(USER_ID, AUTHOR_ID), + features = RecapUserFeatureAggregation.UserAuthorFeaturesV2Count, + labels = RecapUserFeatureAggregation.LabelsV2, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserAuthorAggregateStore, + startDate = "2016-07-15 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )) + ) + ) + + allFeatureAggregates ++ countAggregates + } + + val userAggregatesV5Continuous: AggregateGroup = + AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_aggregate_v5.continuous", + preTransforms = Seq(RichRemoveUserIdZero), + keys = Set(USER_ID), + features = RecapUserFeatureAggregation.UserFeaturesV5Continuous, + labels = RecapUserFeatureAggregation.LabelsV2, + metrics = Set(CountMetric, SumMetric, SumSqMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserAggregateStore, + startDate = "2016-07-15 00:00", + 
commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )) + ) + + val userAuthorAggregatesV5: AggregateGroup = + AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_author_aggregate_v5", + preTransforms = Seq(FilterInNetworkTransform, RichRemoveUserIdZero), + keys = Set(USER_ID, AUTHOR_ID), + features = RecapUserFeatureAggregation.UserAuthorFeaturesV5, + labels = RecapUserFeatureAggregation.LabelsV2, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserAuthorAggregateStore, + startDate = "2016-07-15 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )) + ) + + val tweetSourceUserAuthorAggregatesV1: AggregateGroup = + AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_author_aggregate_tweetsource_v1", + preTransforms = Seq(FilterInNetworkTransform, RichRemoveUserIdZero), + keys = Set(USER_ID, AUTHOR_ID), + features = RecapUserFeatureAggregation.UserAuthorTweetSourceFeaturesV1, + labels = RecapUserFeatureAggregation.LabelsV2, + metrics = Set(CountMetric, SumMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserAuthorAggregateStore, + startDate = "2016-07-15 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )) + ) + + val userEngagerAggregates = AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_engager_aggregate", + keys = Set(USER_ID, EngagementDataRecordFeatures.PublicEngagementUserIds), + features = Set.empty, + labels = RecapUserFeatureAggregation.LabelsV2, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserEngagerAggregateStore, + startDate = 
"2016-09-02 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )), + preTransforms = Seq( + RichRemoveUserIdZero, + RichUnifyPublicEngagersTransform + ) + ) + + val userMentionAggregates = AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + preTransforms = Seq(RichRemoveUserIdZero), /* Eliminates reducer skew */ + aggregatePrefix = "user_mention_aggregate", + keys = Set(USER_ID, RecapFeatures.MENTIONED_SCREEN_NAMES), + features = Set.empty, + labels = RecapUserFeatureAggregation.LabelsV2, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserMentionAggregateStore, + startDate = "2017-03-01 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )), + includeAnyLabel = false + ) + + val twitterWideUserAggregates = AggregateGroup( + inputSource = timelinesDailyTwitterWideSource, + preTransforms = Seq(RichRemoveUserIdZero), /* Eliminates reducer skew */ + aggregatePrefix = "twitter_wide_user_aggregate", + keys = Set(USER_ID), + features = RecapUserFeatureAggregation.TwitterWideFeatures, + labels = RecapUserFeatureAggregation.TwitterWideLabels, + metrics = Set(CountMetric, SumMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = TwitterWideUserAggregateStore, + startDate = "2016-12-28 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )) + ) + + val twitterWideUserAuthorAggregates = AggregateGroup( + inputSource = timelinesDailyTwitterWideSource, + preTransforms = Seq(RichRemoveUserIdZero), /* Eliminates reducer skew */ + aggregatePrefix = "twitter_wide_user_author_aggregate", + keys = Set(USER_ID, AUTHOR_ID), + features = RecapUserFeatureAggregation.TwitterWideFeatures, + labels = RecapUserFeatureAggregation.TwitterWideLabels, + metrics = 
Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = TwitterWideUserAuthorAggregateStore, + startDate = "2016-12-28 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )), + includeAnyLabel = false + ) + + /** + * User-HourOfDay and User-DayOfWeek aggregations, both for recap and rectweet + */ + val userRequestHourAggregates = AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_request_context_aggregate.hour", + preTransforms = Seq(RichRemoveUserIdZero, negativeDownsampleTransform), + keys = Set(USER_ID, RequestContextFeatures.TIMESTAMP_GMT_HOUR), + features = Set.empty, + labels = RecapUserFeatureAggregation.LabelsV2, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserRequestHourAggregateStore, + startDate = "2017-08-01 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )) + ) + + val userRequestDowAggregates = AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_request_context_aggregate.dow", + preTransforms = Seq(RichRemoveUserIdZero, negativeDownsampleTransform), + keys = Set(USER_ID, RequestContextFeatures.TIMESTAMP_GMT_DOW), + features = Set.empty, + labels = RecapUserFeatureAggregation.LabelsV2, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserRequestDowAggregateStore, + startDate = "2017-08-01 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )) + ) + + val authorTopicAggregates = AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "author_topic_aggregate", + preTransforms = Seq(RichRemoveUserIdZero), + keys = Set(AUTHOR_ID, 
TimelinesSharedFeatures.TOPIC_ID), + features = Set.empty, + labels = RecapUserFeatureAggregation.LabelsV2, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = AuthorTopicAggregateStore, + startDate = "2020-05-19 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )) + ) + + val userTopicAggregates = AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_topic_aggregate", + preTransforms = Seq(RichRemoveUserIdZero), + keys = Set(USER_ID, TimelinesSharedFeatures.TOPIC_ID), + features = Set.empty, + labels = RecapUserFeatureAggregation.LabelsV2, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserTopicAggregateStore, + startDate = "2020-05-23 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )) + ) + + val userTopicAggregatesV2 = AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_topic_aggregate_v2", + preTransforms = Seq(RichRemoveUserIdZero), + keys = Set(USER_ID, TimelinesSharedFeatures.TOPIC_ID), + features = RecapUserFeatureAggregation.UserTopicFeaturesV2Count, + labels = RecapUserFeatureAggregation.LabelsV2, + includeAnyFeature = false, + includeAnyLabel = false, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserTopicAggregateStore, + startDate = "2020-05-23 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )) + ) + + val userInferredTopicAggregates = AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_inferred_topic_aggregate", + preTransforms = Seq(RichRemoveUserIdZero), + keys = Set(USER_ID, 
TimelinesSharedFeatures.INFERRED_TOPIC_IDS), + features = Set.empty, + labels = RecapUserFeatureAggregation.LabelsV2, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserInferredTopicAggregateStore, + startDate = "2020-09-09 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )) + ) + + val userInferredTopicAggregatesV2 = AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_inferred_topic_aggregate_v2", + preTransforms = Seq(RichRemoveUserIdZero), + keys = Set(USER_ID, TimelinesSharedFeatures.INFERRED_TOPIC_IDS), + features = RecapUserFeatureAggregation.UserTopicFeaturesV2Count, + labels = RecapUserFeatureAggregation.LabelsV2, + includeAnyFeature = false, + includeAnyLabel = false, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserInferredTopicAggregateStore, + startDate = "2020-09-09 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )) + ) + + val userReciprocalEngagementAggregates = AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_aggregate_v6", + preTransforms = Seq(RichRemoveUserIdZero), + keys = Set(USER_ID), + features = Set.empty, + labels = RecapUserFeatureAggregation.ReciprocalLabels, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserAggregateStore, + startDate = "2016-07-15 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )), + includeAnyLabel = false + ) + + val userOriginalAuthorReciprocalEngagementAggregates = AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_original_author_aggregate_v1", + 
preTransforms = Seq(RichRemoveUserIdZero, RichRemoveAuthorIdZero), + keys = Set(USER_ID, TimelinesSharedFeatures.ORIGINAL_AUTHOR_ID), + features = Set.empty, + labels = RecapUserFeatureAggregation.ReciprocalLabels, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserOriginalAuthorAggregateStore, + startDate = "2018-12-26 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )), + includeAnyLabel = false + ) + + val originalAuthorReciprocalEngagementAggregates = AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "original_author_aggregate_v1", + preTransforms = Seq(RichRemoveUserIdZero, RichRemoveAuthorIdZero), + keys = Set(TimelinesSharedFeatures.ORIGINAL_AUTHOR_ID), + features = Set.empty, + labels = RecapUserFeatureAggregation.ReciprocalLabels, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = OriginalAuthorAggregateStore, + startDate = "2023-02-25 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )), + includeAnyLabel = false + ) + + val originalAuthorNegativeEngagementAggregates = AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "original_author_aggregate_v2", + preTransforms = Seq(RichRemoveUserIdZero, RichRemoveAuthorIdZero), + keys = Set(TimelinesSharedFeatures.ORIGINAL_AUTHOR_ID), + features = Set.empty, + labels = RecapUserFeatureAggregation.NegativeEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = OriginalAuthorAggregateStore, + startDate = "2023-02-25 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )), + includeAnyLabel = false + ) + + val 
userListAggregates: AggregateGroup = + AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_list_aggregate", + keys = Set(USER_ID, ListFeatures.LIST_ID), + features = Set.empty, + labels = RecapUserFeatureAggregation.LabelsV2, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserListAggregateStore, + startDate = "2020-05-28 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )), + preTransforms = Seq(RichRemoveUserIdZero) + ) + + val userMediaUnderstandingAnnotationAggregates: AggregateGroup = AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_media_annotation_aggregate", + preTransforms = Seq(RichRemoveUserIdZero), + keys = + Set(USER_ID, SemanticCoreFeatures.mediaUnderstandingHighRecallNonSensitiveEntityIdsFeature), + features = Set.empty, + labels = RecapUserFeatureAggregation.LabelsV2, + metrics = Set(CountMetric), + halfLives = Set(50.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserMediaUnderstandingAnnotationAggregateStore, + startDate = "2021-03-20 00:00", + commonConfig = timelinesOfflineAggregateSink + )) + ) + + val userAuthorGoodClickAggregates = AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_author_good_click_aggregate", + preTransforms = Seq(FilterInNetworkTransform, RichRemoveUserIdZero), + keys = Set(USER_ID, AUTHOR_ID), + features = RecapUserFeatureAggregation.UserAuthorFeaturesV2, + labels = RecapUserFeatureAggregation.GoodClickLabels, + metrics = Set(SumMetric), + halfLives = Set(14.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserAuthorAggregateStore, + startDate = "2016-07-15 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )) + ) + + val 
userEngagerGoodClickAggregates = AggregateGroup( + inputSource = timelinesDailyRecapMinimalSource, + aggregatePrefix = "user_engager_good_click_aggregate", + keys = Set(USER_ID, EngagementDataRecordFeatures.PublicEngagementUserIds), + features = Set.empty, + labels = RecapUserFeatureAggregation.GoodClickLabels, + metrics = Set(CountMetric), + halfLives = Set(14.days), + outputStore = mkPhysicalStore( + OfflineAggregateDataRecordStore( + name = UserEngagerAggregateStore, + startDate = "2016-09-02 00:00", + commonConfig = timelinesOfflineAggregateSink, + maxKvSourceFailures = defaultMaxKvSourceFailures + )), + preTransforms = Seq( + RichRemoveUserIdZero, + RichUnifyPublicEngagersTransform + ) + ) + +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigTrait.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigTrait.scala new file mode 100644 index 0000000000..6fb2e07b73 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigTrait.scala @@ -0,0 +1,50 @@ +package com.twitter.timelines.prediction.common.aggregates + +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationConfig +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateGroup +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup + +trait TimelinesAggregationConfigTrait + extends TimelinesAggregationConfigDetails + with AggregationConfig { + private val aggregateGroups = Set( + authorTopicAggregates, + userTopicAggregates, + userTopicAggregatesV2, + userInferredTopicAggregates, + userInferredTopicAggregatesV2, + userAggregatesV2, + userAggregatesV5Continuous, + userReciprocalEngagementAggregates, + userAuthorAggregatesV5, + userOriginalAuthorReciprocalEngagementAggregates, + originalAuthorReciprocalEngagementAggregates, + tweetSourceUserAuthorAggregatesV1, + 
userEngagerAggregates, + userMentionAggregates, + twitterWideUserAggregates, + twitterWideUserAuthorAggregates, + userRequestHourAggregates, + userRequestDowAggregates, + userListAggregates, + userMediaUnderstandingAnnotationAggregates, + ) ++ userAuthorAggregatesV2 + + val aggregatesToComputeList: Set[List[TypedAggregateGroup[_]]] = + aggregateGroups.map(_.buildTypedAggregateGroups()) + + override val aggregatesToCompute: Set[TypedAggregateGroup[_]] = aggregatesToComputeList.flatten + + /* + * Feature selection config to save storage space and manhattan query bandwidth. + * Only the most important features found using offline RCE simulations are used + * when actually training and serving. This selector is used by + * [[com.twitter.timelines.data_processing.jobs.timeline_ranking_user_features.TimelineRankingAggregatesV2FeaturesProdJob]] + * but defined here to keep it in sync with the config that computes the aggregates. + */ + val AggregatesV2FeatureSelector = FeatureSelectorConfig.AggregatesV2ProdFeatureSelector + + def filterAggregatesGroups(storeNames: Set[String]): Set[AggregateGroup] = { + aggregateGroups.filter(aggregateGroup => storeNames.contains(aggregateGroup.outputStore.name)) + } +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationKeyValInjections.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationKeyValInjections.scala new file mode 100644 index 0000000000..1f2433b53e --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationKeyValInjections.scala @@ -0,0 +1,48 @@ +package com.twitter.timelines.prediction.common.aggregates + +import com.twitter.ml.api.DataRecord +import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection +import com.twitter.summingbird.batch.BatchID +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.{ + AggregateStore, + AggregationKey, + OfflineAggregateInjections, 
+ TypedAggregateGroup +} + +object TimelinesAggregationKeyValInjections extends TimelinesAggregationConfigTrait { + + import OfflineAggregateInjections.getInjection + + type KVInjection = KeyValInjection[AggregationKey, (BatchID, DataRecord)] + + val AuthorTopic: KVInjection = getInjection(filter(AuthorTopicAggregateStore)) + val UserTopic: KVInjection = getInjection(filter(UserTopicAggregateStore)) + val UserInferredTopic: KVInjection = getInjection(filter(UserInferredTopicAggregateStore)) + val User: KVInjection = getInjection(filter(UserAggregateStore)) + val UserAuthor: KVInjection = getInjection(filter(UserAuthorAggregateStore)) + val UserOriginalAuthor: KVInjection = getInjection(filter(UserOriginalAuthorAggregateStore)) + val OriginalAuthor: KVInjection = getInjection(filter(OriginalAuthorAggregateStore)) + val UserEngager: KVInjection = getInjection(filter(UserEngagerAggregateStore)) + val UserMention: KVInjection = getInjection(filter(UserMentionAggregateStore)) + val TwitterWideUser: KVInjection = getInjection(filter(TwitterWideUserAggregateStore)) + val TwitterWideUserAuthor: KVInjection = getInjection(filter(TwitterWideUserAuthorAggregateStore)) + val UserRequestHour: KVInjection = getInjection(filter(UserRequestHourAggregateStore)) + val UserRequestDow: KVInjection = getInjection(filter(UserRequestDowAggregateStore)) + val UserList: KVInjection = getInjection(filter(UserListAggregateStore)) + val UserMediaUnderstandingAnnotation: KVInjection = getInjection( + filter(UserMediaUnderstandingAnnotationAggregateStore)) + + private def filter(storeName: String): Set[TypedAggregateGroup[_]] = { + val groups = aggregatesToCompute.filter(_.outputStore.name == storeName) + require(groups.nonEmpty, s"No aggregate groups are configured for output store '$storeName'") + groups + } + + override def outputHdfsPath: String = "/user/timelines/processed/aggregates_v2" + + // Since this object is not used to execute any online or offline aggregates job, but is meant + // to store all PDT enabled KeyValInjections, we do not need to
construct a physical store. + // We use the identity operation as a default. + override def mkPhysicalStore(store: AggregateStore): AggregateStore = store +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationSources.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationSources.scala new file mode 100644 index 0000000000..c799f22fa7 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationSources.scala @@ -0,0 +1,45 @@ +package com.twitter.timelines.prediction.common.aggregates + +import com.twitter.ml.api.constant.SharedFeatures.TIMESTAMP +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.OfflineAggregateSource +import com.twitter.timelines.prediction.features.p_home_latest.HomeLatestUserAggregatesFeatures +import timelines.data_processing.ad_hoc.recap.data_record_preparation.RecapDataRecordsAggMinimalJavaDataset + +/** + * Any update here should be in sync with [[TimelinesFeatureGroups]] and [[AggMinimalDataRecordGeneratorJob]]. 
+ */ +object TimelinesAggregationSources { + + /** + * This is the recap data records after post-processing in [[GenerateRecapAggMinimalDataRecordsJob]] + */ + val timelinesDailyRecapMinimalSource = OfflineAggregateSource( + name = "timelines_daily_recap", + timestampFeature = TIMESTAMP, + dalDataSet = Some(RecapDataRecordsAggMinimalJavaDataset), + scaldingSuffixType = Some("dal"), + withValidation = true + ) + val timelinesDailyTwitterWideSource = OfflineAggregateSource( + name = "timelines_daily_twitter_wide", + timestampFeature = TIMESTAMP, + scaldingHdfsPath = Some("/user/timelines/processed/suggests/recap/twitter_wide_data_records"), + scaldingSuffixType = Some("daily"), + withValidation = true + ) + + val timelinesDailyListTimelineSource = OfflineAggregateSource( + name = "timelines_daily_list_timeline", + timestampFeature = TIMESTAMP, + scaldingHdfsPath = Some("/user/timelines/processed/suggests/recap/all_features/list"), + scaldingSuffixType = Some("hourly"), + withValidation = true + ) + + val timelinesDailyHomeLatestSource = OfflineAggregateSource( + name = "timelines_daily_home_latest", + timestampFeature = HomeLatestUserAggregatesFeatures.AGGREGATE_TIMESTAMP_MS, + scaldingHdfsPath = Some("/user/timelines/processed/p_home_latest/user_aggregates"), + scaldingSuffixType = Some("daily") + ) +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/AuthorFeaturesAdapter.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/AuthorFeaturesAdapter.scala new file mode 100644 index 0000000000..7cefc67b9e --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/AuthorFeaturesAdapter.scala @@ -0,0 +1,70 @@ +package com.twitter.timelines.prediction.common.aggregates.real_time + +import com.twitter.dal.personal_data.thriftjava.PersonalDataType.UserState +import com.twitter.ml.api.Feature.Binary +import com.twitter.ml.api.{DataRecord, Feature, FeatureContext, RichDataRecord} 
+import com.twitter.ml.featurestore.catalog.entities.core.Author +import com.twitter.ml.featurestore.catalog.features.magicrecs.UserActivity +import com.twitter.ml.featurestore.lib.data.PredictionRecord +import com.twitter.ml.featurestore.lib.feature.{BoundFeature, BoundFeatureSet} +import com.twitter.ml.featurestore.lib.{UserId, Discrete => FSDiscrete} +import com.twitter.timelines.prediction.common.adapters.TimelinesAdapterBase +import java.lang.{Boolean => JBoolean} +import java.util +import scala.collection.JavaConverters._ + +object AuthorFeaturesAdapter extends TimelinesAdapterBase[PredictionRecord] { + val UserStateBoundFeature: BoundFeature[UserId, FSDiscrete] = UserActivity.UserState.bind(Author) + val UserFeaturesSet: BoundFeatureSet = BoundFeatureSet(UserStateBoundFeature) + + /** + * Boolean features about the author's user state (NEAR_ZERO, VERY_LIGHT and LIGHT all map to IS_USER_LIGHT). + * enum UserState { + * NEW = 0, + * NEAR_ZERO = 1, + * VERY_LIGHT = 2, + * LIGHT = 3, + * MEDIUM_TWEETER = 4, + * MEDIUM_NON_TWEETER = 5, + * HEAVY_NON_TWEETER = 6, + * HEAVY_TWEETER = 7 + * }(persisted='true') + */ + val IS_USER_NEW = new Binary("timelines.author.user_state.is_user_new", Set(UserState).asJava) + val IS_USER_LIGHT = new Binary("timelines.author.user_state.is_user_light", Set(UserState).asJava) + val IS_USER_MEDIUM_TWEETER = + new Binary("timelines.author.user_state.is_user_medium_tweeter", Set(UserState).asJava) + val IS_USER_MEDIUM_NON_TWEETER = + new Binary("timelines.author.user_state.is_user_medium_non_tweeter", Set(UserState).asJava) + val IS_USER_HEAVY_NON_TWEETER = + new Binary("timelines.author.user_state.is_user_heavy_non_tweeter", Set(UserState).asJava) + val IS_USER_HEAVY_TWEETER = + new Binary("timelines.author.user_state.is_user_heavy_tweeter", Set(UserState).asJava) + val userStateToFeatureMap: Map[Long, Binary] = Map( + 0L -> IS_USER_NEW, + 1L -> IS_USER_LIGHT, + 2L -> IS_USER_LIGHT, + 3L -> IS_USER_LIGHT, + 4L -> IS_USER_MEDIUM_TWEETER, + 5L -> IS_USER_MEDIUM_NON_TWEETER, + 6L ->
IS_USER_HEAVY_NON_TWEETER, + 7L -> IS_USER_HEAVY_TWEETER + ) + + val UserStateBooleanFeatures: Set[Feature[_]] = userStateToFeatureMap.values.toSet + + private val allFeatures: Seq[Feature[_]] = UserStateBooleanFeatures.toSeq + override def getFeatureContext: FeatureContext = new FeatureContext(allFeatures: _*) + override def commonFeatures: Set[Feature[_]] = Set.empty + + override def adaptToDataRecords(record: PredictionRecord): util.List[DataRecord] = { + val newRecord = new RichDataRecord(new DataRecord) + record + .getFeatureValue(UserStateBoundFeature) + .flatMap { userState => userStateToFeatureMap.get(userState.value) }.foreach { + booleanFeature => newRecord.setFeatureValue[JBoolean](booleanFeature, true) + } + + List(newRecord.getRecord).asJava + } +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/BUILD b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/BUILD new file mode 100644 index 0000000000..93f39405d1 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/BUILD @@ -0,0 +1,199 @@ +heron_binary( + name = "heron-without-jass", + main = "com.twitter.timelines.prediction.common.aggregates.real_time.TypeSafeRunner", + oss = True, + platform = "java8", + runtime_platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + ":real_time", + "3rdparty/jvm/org/slf4j:slf4j-jdk14", + ], +) + +jvm_app( + name = "rta_heron", + binary = ":heron-without-jass", + bundles = [ + bundle( + fileset = ["resources/jaas.conf"], + ), + ], + tags = [ + "bazel-compatible", + "bazel-only", + ], +) + +scala_library( + sources = ["*.scala"], + platform = "java8", + strict_deps = False, + tags = ["bazel-compatible"], + dependencies = [ + ":online-configs", + "3rdparty/src/jvm/com/twitter/summingbird:storm", + "src/java/com/twitter/heron/util", + "src/java/com/twitter/ml/api:api-base", + "src/java/com/twitter/ml/api/constant", + 
"src/scala/com/twitter/frigate/data_pipeline/features_aggregated/core:core-features", + "src/scala/com/twitter/ml/api/util", + "src/scala/com/twitter/storehaus_internal/memcache", + "src/scala/com/twitter/storehaus_internal/util", + "src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits", + "src/scala/com/twitter/summingbird_internal/runner/store_config", + "src/scala/com/twitter/summingbird_internal/runner/storm", + "src/scala/com/twitter/summingbird_internal/sources/storm/remote:ClientEventSourceScrooge2", + "src/scala/com/twitter/timelines/prediction/adapters/client_log_event", + "src/scala/com/twitter/timelines/prediction/adapters/client_log_event_mr", + "src/scala/com/twitter/timelines/prediction/features/client_log_event", + "src/scala/com/twitter/timelines/prediction/features/common", + "src/scala/com/twitter/timelines/prediction/features/list_features", + "src/scala/com/twitter/timelines/prediction/features/recap", + "src/scala/com/twitter/timelines/prediction/features/user_health", + "src/thrift/com/twitter/ml/api:data-java", + "src/thrift/com/twitter/timelines/suggests/common:record-scala", + "timelinemixer/common/src/main/scala/com/twitter/timelinemixer/clients/served_features_cache", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + "timelines/data_processing/ml_util/aggregation_framework/heron", + "timelines/data_processing/ml_util/aggregation_framework/job", + "timelines/data_processing/ml_util/aggregation_framework/metrics", + "timelines/data_processing/ml_util/transforms", + "timelines/src/main/scala/com/twitter/timelines/clients/memcache_common", + "util/util-core:scala", + ], +) + +scala_library( + name = "online-configs", + sources = [ + "AuthorFeaturesAdapter.scala", + "Event.scala", + "FeatureStoreUtils.scala", + "StormAggregateSourceUtils.scala", + "TimelinesOnlineAggregationConfig.scala", + "TimelinesOnlineAggregationConfigBase.scala", + "TimelinesOnlineAggregationSources.scala", + 
"TimelinesStormAggregateSource.scala", + "TweetFeaturesReadableStore.scala", + "UserFeaturesAdapter.scala", + "UserFeaturesReadableStore.scala", + ], + platform = "java8", + strict_deps = True, + tags = ["bazel-compatible"], + dependencies = [ + ":base-config", + "3rdparty/src/jvm/com/twitter/scalding:db", + "3rdparty/src/jvm/com/twitter/storehaus:core", + "3rdparty/src/jvm/com/twitter/summingbird:core", + "3rdparty/src/jvm/com/twitter/summingbird:online", + "3rdparty/src/jvm/com/twitter/summingbird:storm", + "abuse/detection/src/main/thrift/com/twitter/abuse/detection/mention_interactions:thrift-scala", + "snowflake/src/main/scala/com/twitter/snowflake/id", + "snowflake/src/main/thrift:thrift-scala", + "src/java/com/twitter/ml/api:api-base", + "src/java/com/twitter/ml/api/constant", + "src/scala/com/twitter/frigate/data_pipeline/features_aggregated/core:core-features", + "src/scala/com/twitter/ml/api/util:datarecord", + "src/scala/com/twitter/ml/featurestore/catalog/datasets/geo:geo-user-location", + "src/scala/com/twitter/ml/featurestore/catalog/datasets/magicrecs:user-features", + "src/scala/com/twitter/ml/featurestore/catalog/entities/core", + "src/scala/com/twitter/ml/featurestore/catalog/features/core:user", + "src/scala/com/twitter/ml/featurestore/catalog/features/geo", + "src/scala/com/twitter/ml/featurestore/catalog/features/magicrecs:user-activity", + "src/scala/com/twitter/ml/featurestore/catalog/features/magicrecs:user-info", + "src/scala/com/twitter/ml/featurestore/catalog/features/trends:tweet_trends_scores", + "src/scala/com/twitter/ml/featurestore/lib/data", + "src/scala/com/twitter/ml/featurestore/lib/dataset/offline", + "src/scala/com/twitter/ml/featurestore/lib/export/strato:app-names", + "src/scala/com/twitter/ml/featurestore/lib/feature", + "src/scala/com/twitter/ml/featurestore/lib/online", + "src/scala/com/twitter/ml/featurestore/lib/params", + "src/scala/com/twitter/storehaus_internal/util", + 
"src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits", + "src/scala/com/twitter/summingbird_internal/runner/store_config", + "src/scala/com/twitter/summingbird_internal/runner/storm", + "src/scala/com/twitter/summingbird_internal/sources/common", + "src/scala/com/twitter/summingbird_internal/sources/common/remote:ClientEventSourceScrooge", + "src/scala/com/twitter/summingbird_internal/sources/storm/remote:ClientEventSourceScrooge2", + "src/scala/com/twitter/timelines/prediction/adapters/client_log_event", + "src/scala/com/twitter/timelines/prediction/adapters/client_log_event_mr", + "src/scala/com/twitter/timelines/prediction/common/adapters:base", + "src/scala/com/twitter/timelines/prediction/common/adapters:engagement-converter", + "src/scala/com/twitter/timelines/prediction/common/aggregates", + "src/scala/com/twitter/timelines/prediction/features/client_log_event", + "src/scala/com/twitter/timelines/prediction/features/common", + "src/scala/com/twitter/timelines/prediction/features/list_features", + "src/scala/com/twitter/timelines/prediction/features/recap", + "src/scala/com/twitter/timelines/prediction/features/user_health", + "src/thrift/com/twitter/clientapp/gen:clientapp-scala", + "src/thrift/com/twitter/dal/personal_data:personal_data-java", + "src/thrift/com/twitter/ml/api:data-java", + "src/thrift/com/twitter/timelines/suggests/common:engagement-java", + "src/thrift/com/twitter/timelines/suggests/common:engagement-scala", + "src/thrift/com/twitter/timelines/suggests/common:record-scala", + "src/thrift/com/twitter/timelineservice/injection:thrift-scala", + "src/thrift/com/twitter/timelineservice/server/suggests/logging:thrift-scala", + "strato/src/main/scala/com/twitter/strato/client", + "timelinemixer/common/src/main/scala/com/twitter/timelinemixer/clients/served_features_cache", + "timelines/data_processing/ad_hoc/suggests/common:raw_training_data_creator", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + 
"timelines/data_processing/ml_util/aggregation_framework/heron:configs", + "timelines/data_processing/ml_util/aggregation_framework/metrics", + "timelines/data_processing/ml_util/transforms", + "timelines/data_processing/util:rich-request", + "tweetsource/common/src/main/thrift:thrift-scala", + "twitter-server-internal/src/main/scala", + "unified_user_actions/client/src/main/scala/com/twitter/unified_user_actions/client/config", + "unified_user_actions/client/src/main/scala/com/twitter/unified_user_actions/client/summingbird", + "unified_user_actions/thrift/src/main/thrift/com/twitter/unified_user_actions:unified_user_actions-scala", + "util/util-core:scala", + "util/util-stats/src/main/scala/com/twitter/finagle/stats", + ], +) + +scala_library( + name = "base-config", + sources = [ + "AuthorFeaturesAdapter.scala", + "TimelinesOnlineAggregationConfigBase.scala", + "TweetFeaturesAdapter.scala", + "UserFeaturesAdapter.scala", + ], + platform = "java8", + strict_deps = True, + tags = ["bazel-compatible"], + dependencies = [ + "src/java/com/twitter/ml/api:api-base", + "src/java/com/twitter/ml/api/constant", + "src/resources/com/twitter/timelines/prediction/common/aggregates/real_time", + "src/scala/com/twitter/ml/api/util:datarecord", + "src/scala/com/twitter/ml/featurestore/catalog/datasets/magicrecs:user-features", + "src/scala/com/twitter/ml/featurestore/catalog/entities/core", + "src/scala/com/twitter/ml/featurestore/catalog/features/core:user", + "src/scala/com/twitter/ml/featurestore/catalog/features/geo", + "src/scala/com/twitter/ml/featurestore/catalog/features/magicrecs:user-activity", + "src/scala/com/twitter/ml/featurestore/catalog/features/magicrecs:user-info", + "src/scala/com/twitter/ml/featurestore/catalog/features/trends:tweet_trends_scores", + "src/scala/com/twitter/ml/featurestore/lib/data", + "src/scala/com/twitter/ml/featurestore/lib/feature", + "src/scala/com/twitter/timelines/prediction/common/adapters:base", + 
"src/scala/com/twitter/timelines/prediction/common/adapters:engagement-converter", + "src/scala/com/twitter/timelines/prediction/common/aggregates", + "src/scala/com/twitter/timelines/prediction/features/client_log_event", + "src/scala/com/twitter/timelines/prediction/features/common", + "src/scala/com/twitter/timelines/prediction/features/list_features", + "src/scala/com/twitter/timelines/prediction/features/recap", + "src/scala/com/twitter/timelines/prediction/features/user_health", + "src/thrift/com/twitter/dal/personal_data:personal_data-java", + "src/thrift/com/twitter/ml/api:feature_context-java", + "src/thrift/com/twitter/timelines/suggests/common:engagement-scala", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + "timelines/data_processing/ml_util/aggregation_framework/heron:base-config", + "timelines/data_processing/ml_util/aggregation_framework/metrics", + "timelines/data_processing/ml_util/transforms", + "util/util-core:scala", + "util/util-core:util-core-util", + ], +) diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/Event.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/Event.scala new file mode 100644 index 0000000000..1bd697d0df --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/Event.scala @@ -0,0 +1,11 @@ +package com.twitter.timelines.prediction.common.aggregates.real_time + +private[real_time] sealed trait Event[T] { def event: T } + +private[real_time] case class HomeEvent[T](override val event: T) extends Event[T] + +private[real_time] case class ProfileEvent[T](override val event: T) extends Event[T] + +private[real_time] case class SearchEvent[T](override val event: T) extends Event[T] + +private[real_time] case class UuaEvent[T](override val event: T) extends Event[T] diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/FeatureStoreUtils.scala 
b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/FeatureStoreUtils.scala new file mode 100644 index 0000000000..156d9d35f3 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/FeatureStoreUtils.scala @@ -0,0 +1,53 @@ +package com.twitter.timelines.prediction.common.aggregates.real_time + +import com.twitter.finagle.mtls.authentication.ServiceIdentifier +import com.twitter.finagle.stats.StatsReceiver +import com.twitter.ml.featurestore.catalog.datasets.magicrecs.UserFeaturesDataset +import com.twitter.ml.featurestore.catalog.datasets.geo.GeoUserLocationDataset +import com.twitter.ml.featurestore.lib.dataset.DatasetParams +import com.twitter.ml.featurestore.lib.export.strato.FeatureStoreAppNames +import com.twitter.ml.featurestore.lib.online.FeatureStoreClient +import com.twitter.ml.featurestore.lib.params.FeatureStoreParams +import com.twitter.strato.client.{Client, Strato} +import com.twitter.strato.opcontext.Attribution.ManhattanAppId +import com.twitter.util.Duration + +private[real_time] object FeatureStoreUtils { + private def mkStratoClient(serviceIdentifier: ServiceIdentifier): Client = + Strato.client + .withMutualTls(serviceIdentifier) + .withRequestTimeout(Duration.fromMilliseconds(50)) + .build() + + private val featureStoreParams: FeatureStoreParams = + FeatureStoreParams( + perDataset = Map( + UserFeaturesDataset.id -> + DatasetParams( + stratoSuffix = Some(FeatureStoreAppNames.Timelines), + attributions = Seq(ManhattanAppId("athena", "timelines_aggregates_v2_features_by_user")) + ), + GeoUserLocationDataset.id -> + DatasetParams( + attributions = Seq(ManhattanAppId("starbuck", "timelines_geo_features_by_user")) + ) + ) + ) + + def mkFeatureStoreClient( + serviceIdentifier: ServiceIdentifier, + statsReceiver: StatsReceiver + ): FeatureStoreClient = { + com.twitter.server.Init() // necessary in order to use WilyNS path + + val stratoClient: Client = mkStratoClient(serviceIdentifier) + val 
featureStoreClient: FeatureStoreClient = FeatureStoreClient( + featureSet = + UserFeaturesAdapter.UserFeaturesSet ++ AuthorFeaturesAdapter.UserFeaturesSet ++ TweetFeaturesAdapter.TweetFeaturesSet, + client = stratoClient, + statsReceiver = statsReceiver, + featureStoreParams = featureStoreParams + ) + featureStoreClient + } +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/LocallyReplicatedStore.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/LocallyReplicatedStore.scala new file mode 100644 index 0000000000..42f86fa4fd --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/LocallyReplicatedStore.scala @@ -0,0 +1,79 @@ +package com.twitter.timelines.prediction.common.aggregates.real_time + +import com.twitter.conversions.DurationOps._ +import com.twitter.finagle.stats.StatsReceiver +import com.twitter.storehaus.ReplicatedReadableStore +import com.twitter.storehaus.Store +import com.twitter.timelines.clients.memcache_common._ +import com.twitter.timelines.util.FailOpenHandler +import com.twitter.util.Future + +object ServedFeaturesMemcacheConfigBuilder { + def getTwCacheDestination(cluster: String, isProd: Boolean = false): String = + if (!isProd) { + s"/srv#/test/$cluster/cache//twemcache_timelines_served_features_cache" + } else { + s"/srv#/prod/$cluster/cache/timelines_served_features" + } + + /** + * @param cluster The DC of the cache that this client will send requests to. This + * can differ from the DC where the summingbird job is running. + * @param isProd Whether this client is part of a production summingbird job, as + * different access points will need to be chosen.
+ */ + def build(cluster: String, isProd: Boolean = false): StorehausMemcacheConfig = + StorehausMemcacheConfig( + destName = getTwCacheDestination(cluster, isProd), + keyPrefix = "", + requestTimeout = 200.milliseconds, + numTries = 2, + globalTimeout = 400.milliseconds, + tcpConnectTimeout = 200.milliseconds, + connectionAcquisitionTimeout = 200.milliseconds, + numPendingRequests = 1000, + isReadOnly = false + ) +} + +/** + * If lookup key does not exist locally, make a call to the replicated store(s). + * If value exists remotely, write the first returned value to the local store + * and return it. Map any exceptions to None so that the subsequent operations + * may proceed. + * + * @param localStore store consulted first and written back to on a remote hit + * @param remoteStore replicated store consulted when the local store has no value + * @param scopedStatsReceiver receives the "failOpen" scope and the localFails, localWrites and + * remoteFails counters + */ +class LocallyReplicatedStore[-K, V]( + localStore: Store[K, V], + remoteStore: ReplicatedReadableStore[K, V], + scopedStatsReceiver: StatsReceiver) + extends Store[K, V] { + private[this] val failOpenHandler = new FailOpenHandler(scopedStatsReceiver.scope("failOpen")) + private[this] val localFailsCounter = scopedStatsReceiver.counter("localFails") + private[this] val localWritesCounter = scopedStatsReceiver.counter("localWrites") + private[this] val remoteFailsCounter = scopedStatsReceiver.counter("remoteFails") + + override def get(k: K): Future[Option[V]] = + failOpenHandler { + localStore + .get(k) + .flatMap { + case Some(v) => Future.value(Some(v)) + case _ => { + localFailsCounter.incr() + val replicatedOptFu = remoteStore.get(k) + // async write-back if result is not empty; the put is not awaited by the caller + replicatedOptFu.onSuccess { + case Some(v) => { + localWritesCounter.incr() + localStore.put((k, Some(v))) + } + case _ => + remoteFailsCounter.incr() + } + replicatedOptFu + } + } + } { _: Throwable => Future.None } +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/StormAggregateSourceUtils.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/StormAggregateSourceUtils.scala new file mode 100644 index 0000000000..e72d3392bd --- /dev/null
+++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/StormAggregateSourceUtils.scala @@ -0,0 +1,254 @@ +package com.twitter.timelines.prediction.common.aggregates.real_time + +import com.twitter.finagle.stats.Counter +import com.twitter.finagle.stats.StatsReceiver +import com.twitter.ml.api.constant.SharedFeatures +import com.twitter.ml.api.DataRecord +import com.twitter.ml.api.DataRecordMerger +import com.twitter.ml.api.Feature +import com.twitter.ml.api.RichDataRecord +import com.twitter.ml.featurestore.catalog.entities.core.Author +import com.twitter.ml.featurestore.catalog.entities.core.Tweet +import com.twitter.ml.featurestore.catalog.entities.core.User +import com.twitter.ml.featurestore.lib.online.FeatureStoreClient +import com.twitter.summingbird.Producer +import com.twitter.summingbird.storm.Storm +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.RealTimeAggregatesJobConfig +import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures +import java.lang.{Long => JLong} + +import com.twitter.unified_user_actions.thriftscala.ActionType +import com.twitter.unified_user_actions.thriftscala.UnifiedUserAction + +private[real_time] object StormAggregateSourceUtils { + type UserId = Long + type AuthorId = Long + type TweetId = Long + + /** + * Attaches a [[FeatureStoreClient]] to the underlying [[Producer]]. The FeatureStoreClient + * hydrates additional user, author and tweet features. + * + * @param underlyingProducer converts a stream of [[com.twitter.clientapp.thriftscala.LogEvent]] + * to a stream of [[DataRecord]].
+ */ + def wrapByFeatureStoreClient( + underlyingProducer: Producer[Storm, Event[DataRecord]], + jobConfig: RealTimeAggregatesJobConfig, + scopedStatsReceiver: StatsReceiver + ): Producer[Storm, Event[DataRecord]] = { + lazy val keyDataRecordCounter = scopedStatsReceiver.counter("keyDataRecord") + lazy val keyFeatureCounter = scopedStatsReceiver.counter("keyFeature") + lazy val leftDataRecordCounter = scopedStatsReceiver.counter("leftDataRecord") + lazy val rightDataRecordCounter = scopedStatsReceiver.counter("rightDataRecord") + lazy val mergeNumFeaturesCounter = scopedStatsReceiver.counter("mergeNumFeatures") + lazy val authorKeyDataRecordCounter = scopedStatsReceiver.counter("authorKeyDataRecord") + lazy val authorKeyFeatureCounter = scopedStatsReceiver.counter("authorKeyFeature") + lazy val authorLeftDataRecordCounter = scopedStatsReceiver.counter("authorLeftDataRecord") + lazy val authorRightDataRecordCounter = scopedStatsReceiver.counter("authorRightDataRecord") + lazy val authorMergeNumFeaturesCounter = scopedStatsReceiver.counter("authorMergeNumFeatures") + lazy val tweetKeyDataRecordCounter = + scopedStatsReceiver.counter("tweetKeyDataRecord") + lazy val tweetKeyFeatureCounter = scopedStatsReceiver.counter("tweetKeyFeature") + lazy val tweetLeftDataRecordCounter = + scopedStatsReceiver.counter("tweetLeftDataRecord") + lazy val tweetRightDataRecordCounter = + scopedStatsReceiver.counter("tweetRightDataRecord") + lazy val tweetMergeNumFeaturesCounter = + scopedStatsReceiver.counter("tweetMergeNumFeatures") + + @transient lazy val featureStoreClient: FeatureStoreClient = + FeatureStoreUtils.mkFeatureStoreClient( + serviceIdentifier = jobConfig.serviceIdentifier, + statsReceiver = scopedStatsReceiver + ) + + lazy val joinUserFeaturesDataRecordProducer = + if (jobConfig.keyedByUserEnabled) { + lazy val keyedByUserFeaturesStormService: Storm#Service[Set[UserId], DataRecord] = + Storm.service( + new UserFeaturesReadableStore( + featureStoreClient = 
featureStoreClient, + userEntity = User, + userFeaturesAdapter = UserFeaturesAdapter + ) + ) + + leftJoinDataRecordProducer( + keyFeature = SharedFeatures.USER_ID, + leftDataRecordProducer = underlyingProducer, + rightStormService = keyedByUserFeaturesStormService, + keyDataRecordCounter = keyDataRecordCounter, + keyFeatureCounter = keyFeatureCounter, + leftDataRecordCounter = leftDataRecordCounter, + rightDataRecordCounter = rightDataRecordCounter, + mergeNumFeaturesCounter = mergeNumFeaturesCounter + ) + } else { + underlyingProducer + } + + lazy val joinAuthorFeaturesDataRecordProducer = + if (jobConfig.keyedByAuthorEnabled) { + lazy val keyedByAuthorFeaturesStormService: Storm#Service[Set[AuthorId], DataRecord] = + Storm.service( + new UserFeaturesReadableStore( + featureStoreClient = featureStoreClient, + userEntity = Author, + userFeaturesAdapter = AuthorFeaturesAdapter + ) + ) + + leftJoinDataRecordProducer( + keyFeature = TimelinesSharedFeatures.SOURCE_AUTHOR_ID, + leftDataRecordProducer = joinUserFeaturesDataRecordProducer, + rightStormService = keyedByAuthorFeaturesStormService, + keyDataRecordCounter = authorKeyDataRecordCounter, + keyFeatureCounter = authorKeyFeatureCounter, + leftDataRecordCounter = authorLeftDataRecordCounter, + rightDataRecordCounter = authorRightDataRecordCounter, + mergeNumFeaturesCounter = authorMergeNumFeaturesCounter + ) + } else { + joinUserFeaturesDataRecordProducer + } + + lazy val joinTweetFeaturesDataRecordProducer = { + if (jobConfig.keyedByTweetEnabled) { + lazy val keyedByTweetFeaturesStormService: Storm#Service[Set[TweetId], DataRecord] = + Storm.service( + new TweetFeaturesReadableStore( + featureStoreClient = featureStoreClient, + tweetEntity = Tweet, + tweetFeaturesAdapter = TweetFeaturesAdapter + ) + ) + + leftJoinDataRecordProducer( + keyFeature = TimelinesSharedFeatures.SOURCE_TWEET_ID, + leftDataRecordProducer = joinAuthorFeaturesDataRecordProducer, + rightStormService = keyedByTweetFeaturesStormService, + 
keyDataRecordCounter = tweetKeyDataRecordCounter, + keyFeatureCounter = tweetKeyFeatureCounter, + leftDataRecordCounter = tweetLeftDataRecordCounter, + rightDataRecordCounter = tweetRightDataRecordCounter, + mergeNumFeaturesCounter = tweetMergeNumFeaturesCounter + ) + } else { + joinAuthorFeaturesDataRecordProducer + } + } + + joinTweetFeaturesDataRecordProducer + } + + private[this] lazy val DataRecordMerger = new DataRecordMerger + + /** + * Make the join key from the client event data record. + * @param keyFeature Feature to extract join key value: USER_ID, SOURCE_TWEET_ID, etc. + * @param record DataRecord containing client engagement and basic tweet-side features + * @param keyDataRecordCounter incremented once per call + * @param keyFeatureCounter incremented only when `record` has `keyFeature` + * @return A single-element set holding the value of `keyFeature` when the record has that + * feature, otherwise an empty set. + */ + private[this] def mkKey( + keyFeature: Feature[JLong], + record: DataRecord, + keyDataRecordCounter: Counter, + keyFeatureCounter: Counter + ): Set[Long] = { + keyDataRecordCounter.incr() + val richRecord = new RichDataRecord(record) + if (richRecord.hasFeature(keyFeature)) { + keyFeatureCounter.incr() + val key: Long = richRecord.getFeatureValue(keyFeature).toLong + Set(key) + } else { + Set.empty[Long] + } + } + + /** + * After the leftJoin, merge the client event data record and the joined data record + * into a single data record used for further aggregation.
+ */ + private[this] def mergeDataRecord( + leftRecord: Event[DataRecord], + rightRecordOpt: Option[DataRecord], + leftDataRecordCounter: Counter, + rightDataRecordCounter: Counter, + mergeNumFeaturesCounter: Counter + ): Event[DataRecord] = { + leftDataRecordCounter.incr() + rightRecordOpt.foreach { rightRecord => + rightDataRecordCounter.incr() + DataRecordMerger.merge(leftRecord.event, rightRecord) + mergeNumFeaturesCounter.incr(new RichDataRecord(leftRecord.event).numFeatures()) + } + leftRecord + } + + private[this] def leftJoinDataRecordProducer( + keyFeature: Feature[JLong], + leftDataRecordProducer: Producer[Storm, Event[DataRecord]], + rightStormService: Storm#Service[Set[Long], DataRecord], + keyDataRecordCounter: => Counter, + keyFeatureCounter: => Counter, + leftDataRecordCounter: => Counter, + rightDataRecordCounter: => Counter, + mergeNumFeaturesCounter: => Counter + ): Producer[Storm, Event[DataRecord]] = { + val keyedLeftDataRecordProducer: Producer[Storm, (Set[Long], Event[DataRecord])] = + leftDataRecordProducer.map { + case dataRecord: HomeEvent[DataRecord] => + val key = mkKey( + keyFeature = keyFeature, + record = dataRecord.event, + keyDataRecordCounter = keyDataRecordCounter, + keyFeatureCounter = keyFeatureCounter + ) + (key, dataRecord) + case dataRecord: ProfileEvent[DataRecord] => + val key = Set.empty[Long] + (key, dataRecord) + case dataRecord: SearchEvent[DataRecord] => + val key = Set.empty[Long] + (key, dataRecord) + case dataRecord: UuaEvent[DataRecord] => + val key = Set.empty[Long] + (key, dataRecord) + } + + keyedLeftDataRecordProducer + .leftJoin(rightStormService) + .map { + case (_, (leftRecord, rightRecordOpt)) => + mergeDataRecord( + leftRecord = leftRecord, + rightRecordOpt = rightRecordOpt, + leftDataRecordCounter = leftDataRecordCounter, + rightDataRecordCounter = rightDataRecordCounter, + mergeNumFeaturesCounter = mergeNumFeaturesCounter + ) + } + } + + /** + * Filter Unified User Actions events to include only actions 
that have a home timeline visit prior to landing on the page + * + * @param event the unified user action to test + * @return true for `ClientTweetV2Impression` and `ClientProfileV2Impression` actions whose + * breadcrumb views contain "home", and for `ClientTweetVideoFullscreenV2Impression` + * actions whose breadcrumb views contain both "home" and "video"; false otherwise + */ + def isUuaBCEEventsFromHome(event: UnifiedUserAction): Boolean = { + // An absent breadcrumbViews value counts as "does not contain". + def breadcrumbViewsContain(view: String): Boolean = + event.eventMetadata.breadcrumbViews.exists(_.contains(view)) + + event.actionType match { + case ActionType.ClientTweetV2Impression if breadcrumbViewsContain("home") => + true + case ActionType.ClientTweetVideoFullscreenV2Impression + if breadcrumbViewsContain("home") && breadcrumbViewsContain("video") => + true + case ActionType.ClientProfileV2Impression if breadcrumbViewsContain("home") => + true + case _ => false + } + } +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfig.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfig.scala new file mode 100644 index 0000000000..8d7a41d21b --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfig.scala @@ -0,0 +1,34 @@ +package com.twitter.timelines.prediction.common.aggregates.real_time + +import com.twitter.conversions.DurationOps._ +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.{ + OnlineAggregationStoresTrait, + RealTimeAggregateStore +} + +object TimelinesOnlineAggregationConfig + extends TimelinesOnlineAggregationDefinitionsTrait + with OnlineAggregationStoresTrait { + + import TimelinesOnlineAggregationSources._ + + override lazy val ProductionStore = RealTimeAggregateStore( + memcacheDataSet = "timelines_real_time_aggregates", + isProd = true, + cacheTTL = 5.days + ) + + override lazy val StagingStore = RealTimeAggregateStore( + memcacheDataSet = "twemcache_timelines_real_time_aggregates", + isProd = false, + cacheTTL = 5.days + ) + + override lazy val inputSource = timelinesOnlineAggregateSource + + /** + * AggregateToCompute: This defines the complete set of aggregates to be + * computed
by the aggregation job and to be stored in memcache. + */ + override lazy val AggregatesToCompute = ProdAggregates ++ StagingAggregates +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfigBase.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfigBase.scala new file mode 100644 index 0000000000..0d7c072e2f --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfigBase.scala @@ -0,0 +1,1112 @@ +package com.twitter.timelines.prediction.common.aggregates.real_time + +import com.twitter.conversions.DurationOps._ +import com.twitter.ml.api.Feature +import com.twitter.ml.api.constant.SharedFeatures +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateGroup +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateSource +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateStore +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.OnlineAggregationConfigTrait +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.CountMetric +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.SumMetric +import com.twitter.timelines.data_processing.ml_util.transforms.BinaryUnion +import com.twitter.timelines.data_processing.ml_util.transforms.DownsampleTransform +import com.twitter.timelines.data_processing.ml_util.transforms.IsNewUserTransform +import com.twitter.timelines.data_processing.ml_util.transforms.IsPositionTransform +import com.twitter.timelines.data_processing.ml_util.transforms.LogTransform +import com.twitter.timelines.data_processing.ml_util.transforms.PositionCase +import com.twitter.timelines.data_processing.ml_util.transforms.RichITransform +import 
com.twitter.timelines.data_processing.ml_util.transforms.RichRemoveUnverifiedUserTransform +import com.twitter.timelines.prediction.features.client_log_event.ClientLogEventDataRecordFeatures +import com.twitter.timelines.prediction.features.common.CombinedFeatures +import com.twitter.timelines.prediction.features.common.CombinedFeatures._ +import com.twitter.timelines.prediction.features.common.ProfileLabelFeatures +import com.twitter.timelines.prediction.features.common.SearchLabelFeatures +import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures +import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures.IS_TOP_FIVE +import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures.IS_TOP_ONE +import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures.IS_TOP_TEN +import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures.LOG_POSITION +import com.twitter.timelines.prediction.features.list_features.ListFeatures +import com.twitter.timelines.prediction.features.recap.RecapFeatures +import com.twitter.util.Duration +import java.lang.{Boolean => JBoolean} +import java.lang.{Long => JLong} +import scala.io.Source + +object TimelinesOnlineAggregationUtils { + val TweetLabels: Set[Feature[JBoolean]] = CombinedFeatures.EngagementsRealTime + val TweetCoreLabels: Set[Feature[JBoolean]] = CombinedFeatures.CoreEngagements + val TweetDwellLabels: Set[Feature[JBoolean]] = CombinedFeatures.DwellEngagements + val TweetCoreAndDwellLabels: Set[Feature[JBoolean]] = TweetCoreLabels ++ TweetDwellLabels + val PrivateEngagementLabelsV2: Set[Feature[JBoolean]] = CombinedFeatures.PrivateEngagementsV2 + val ProfileCoreLabels: Set[Feature[JBoolean]] = ProfileLabelFeatures.CoreEngagements + val ProfileNegativeEngagementLabels: Set[Feature[JBoolean]] = + ProfileLabelFeatures.NegativeEngagements + val ProfileNegativeEngagementUnionLabels: Set[Feature[JBoolean]] = Set( + 
ProfileLabelFeatures.IS_NEGATIVE_FEEDBACK_UNION) + val SearchCoreLabels: Set[Feature[JBoolean]] = SearchLabelFeatures.CoreEngagements + val TweetNegativeEngagementLabels: Set[Feature[JBoolean]] = + CombinedFeatures.NegativeEngagementsRealTime + val TweetNegativeEngagementDontLikeLabels: Set[Feature[JBoolean]] = + CombinedFeatures.NegativeEngagementsRealTimeDontLike + val TweetNegativeEngagementSecondaryLabels: Set[Feature[JBoolean]] = + CombinedFeatures.NegativeEngagementsSecondary + val AllTweetNegativeEngagementLabels: Set[Feature[JBoolean]] = + TweetNegativeEngagementLabels ++ TweetNegativeEngagementDontLikeLabels ++ TweetNegativeEngagementSecondaryLabels + val UserAuthorEngagementLabels: Set[Feature[JBoolean]] = CombinedFeatures.UserAuthorEngagements + val ShareEngagementLabels: Set[Feature[JBoolean]] = CombinedFeatures.ShareEngagements + val BookmarkEngagementLabels: Set[Feature[JBoolean]] = CombinedFeatures.BookmarkEngagements + val AllBCEDwellLabels: Set[Feature[JBoolean]] = + CombinedFeatures.TweetDetailDwellEngagements ++ CombinedFeatures.ProfileDwellEngagements ++ CombinedFeatures.FullscreenVideoDwellEngagements + val AllTweetUnionLabels: Set[Feature[JBoolean]] = Set( + CombinedFeatures.IS_IMPLICIT_POSITIVE_FEEDBACK_UNION, + CombinedFeatures.IS_EXPLICIT_POSITIVE_FEEDBACK_UNION, + CombinedFeatures.IS_ALL_NEGATIVE_FEEDBACK_UNION + ) + val AllTweetLabels: Set[Feature[JBoolean]] = + TweetLabels ++ TweetCoreAndDwellLabels ++ AllTweetNegativeEngagementLabels ++ ProfileCoreLabels ++ ProfileNegativeEngagementLabels ++ ProfileNegativeEngagementUnionLabels ++ UserAuthorEngagementLabels ++ SearchCoreLabels ++ ShareEngagementLabels ++ BookmarkEngagementLabels ++ PrivateEngagementLabelsV2 ++ AllBCEDwellLabels ++ AllTweetUnionLabels + + /** + * Returns a copy of `prodGroup` whose `aggExclusionRegex` is the list of lines read from the + * classpath resource `aggRemovalPath`. If no such resource exists, `prodGroup` is returned + * unchanged. The resource is read eagerly and closed before returning. + * + * @param prodGroup aggregate group to attach the exclusion patterns to + * @param aggRemovalPath path of the classpath resource, one pattern per line + */ + def addFeatureFilterFromResource( + prodGroup: AggregateGroup, + aggRemovalPath: String + ): AggregateGroup = { + // getResourceAsStream returns null for a missing resource; Option(...) maps that to None. + val streamOpt = + Option(Thread.currentThread().getContextClassLoader.getResourceAsStream(aggRemovalPath)) + streamOpt match {
+ case Some(stream) => + val source = Source.fromInputStream(stream) + // Materialize the lines before closing so no lazy sequence outlives the source. + try prodGroup.copy(aggExclusionRegex = source.getLines.toList) + finally source.close() + case None => prodGroup + } + } +} + +trait TimelinesOnlineAggregationDefinitionsTrait extends OnlineAggregationConfigTrait { + import TimelinesOnlineAggregationUtils._ + + def inputSource: AggregateSource + def ProductionStore: AggregateStore + def StagingStore: AggregateStore + + val TweetFeatures: Set[Feature[_]] = Set( + ClientLogEventDataRecordFeatures.HasConsumerVideo, + ClientLogEventDataRecordFeatures.PhotoCount + ) + val CandidateTweetSourceFeatures: Set[Feature[_]] = Set( + ClientLogEventDataRecordFeatures.FromRecap, + ClientLogEventDataRecordFeatures.FromRecycled, + ClientLogEventDataRecordFeatures.FromActivity, + ClientLogEventDataRecordFeatures.FromSimcluster, + ClientLogEventDataRecordFeatures.FromErg, + ClientLogEventDataRecordFeatures.FromCroon, + ClientLogEventDataRecordFeatures.FromList, + ClientLogEventDataRecordFeatures.FromRecTopic + ) + + def createStagingGroup(prodGroup: AggregateGroup): AggregateGroup = + prodGroup.copy( + outputStore = StagingStore + ) + + // Aggregate user engagements/features by tweet Id. + val tweetEngagement30MinuteCountsProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_aggregates_v1", + keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), + features = Set.empty, + labels = TweetLabels ++ TweetNegativeEngagementDontLikeLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + // Aggregate user engagements/features by tweet Id.
+ val tweetVerifiedDontLikeEngagementRealTimeAggregatesProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_aggregates_v6", + preTransforms = Seq(RichRemoveUnverifiedUserTransform), + keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), + features = Set.empty, + labels = TweetNegativeEngagementDontLikeLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + val tweetNegativeEngagement6HourCounts = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_aggregates_v2", + keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), + features = Set.empty, + labels = TweetNegativeEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + val tweetVerifiedNegativeEngagementCounts = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_aggregates_v7", + preTransforms = Seq(RichRemoveUnverifiedUserTransform), + keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), + features = Set.empty, + labels = TweetNegativeEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + val promotedTweetEngagementRealTimeCounts = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_aggregates_v3.is_promoted", + preTransforms = Seq( + DownsampleTransform( + negativeSamplingRate = 0.0, + keepLabels = Set(ClientLogEventDataRecordFeatures.IsPromoted))), + keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), + features = Set.empty, + labels = TweetCoreAndDwellLabels, + metrics = Set(CountMetric), + halfLives = Set(2.hours, 24.hours), + outputStore = ProductionStore, + includeAnyFeature = false, + 
includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate total engagement counts by tweet Id for non-public + * engagements. Similar to EB's public engagement counts. + */ + val tweetEngagementTotalCountsProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_aggregates_v1", + keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), + features = Set.empty, + labels = TweetLabels ++ TweetNegativeEngagementDontLikeLabels, + metrics = Set(CountMetric), + halfLives = Set(Duration.Top), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + val tweetNegativeEngagementTotalCounts = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_aggregates_v2", + keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), + features = Set.empty, + labels = TweetNegativeEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(Duration.Top), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate tweet features grouped by viewer's user id. + */ + val userEngagementRealTimeAggregatesProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_user_aggregates_v1", + keys = Set(SharedFeatures.USER_ID), + features = TweetFeatures, + labels = TweetLabels ++ TweetNegativeEngagementDontLikeLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate tweet features grouped by viewer's user id. 
+ */ + val userEngagementRealTimeAggregatesV2 = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_user_aggregates_v2", + keys = Set(SharedFeatures.USER_ID), + features = ClientLogEventDataRecordFeatures.TweetFeaturesV2, + labels = TweetCoreAndDwellLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyFeature = false, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate author's user state features grouped by viewer's user id. + */ + val userEngagementAuthorUserStateRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_user_aggregates_v3", + preTransforms = Seq.empty, + keys = Set(SharedFeatures.USER_ID), + features = AuthorFeaturesAdapter.UserStateBooleanFeatures, + labels = TweetCoreAndDwellLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyFeature = false, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate author's user state features grouped by viewer's user id. + */ + val userNegativeEngagementAuthorUserStateRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_user_aggregates_v4", + preTransforms = Seq.empty, + keys = Set(SharedFeatures.USER_ID), + features = AuthorFeaturesAdapter.UserStateBooleanFeatures, + labels = TweetNegativeEngagementLabels ++ TweetNegativeEngagementDontLikeLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyFeature = false, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate tweet features grouped by viewer's user id, with 48 hour halfLife. 
+ */ + val userEngagement48HourRealTimeAggregatesProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_user_aggregates_v5", + keys = Set(SharedFeatures.USER_ID), + features = TweetFeatures, + labels = TweetLabels ++ TweetNegativeEngagementDontLikeLabels, + metrics = Set(CountMetric), + halfLives = Set(48.hours), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate author's user state features grouped by viewer's user id. + */ + val userNegativeEngagementAuthorUserState72HourRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_user_aggregates_v6", + preTransforms = Seq.empty, + keys = Set(SharedFeatures.USER_ID), + features = AuthorFeaturesAdapter.UserStateBooleanFeatures, + labels = TweetNegativeEngagementLabels ++ TweetNegativeEngagementDontLikeLabels, + metrics = Set(CountMetric), + halfLives = Set(72.hours), + outputStore = ProductionStore, + includeAnyFeature = false, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate features grouped by source author id: for each author, aggregate features are created + * to quantify engagements (fav, reply, etc.) which tweets of the author has received. + */ + val authorEngagementRealTimeAggregatesProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_author_aggregates_v1", + keys = Set(TimelinesSharedFeatures.SOURCE_AUTHOR_ID), + features = Set.empty, + labels = TweetLabels ++ TweetNegativeEngagementDontLikeLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate features grouped by source author id: for each author, aggregate features are created + * to quantify negative engagements (mute, block, etc.) which tweets of the author has received. 
+ * + * This aggregate group is not used in Home, but it is used in Follow Recommendation Service so need to keep it for now. + * + */ + val authorNegativeEngagementRealTimeAggregatesProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_author_aggregates_v2", + keys = Set(TimelinesSharedFeatures.SOURCE_AUTHOR_ID), + features = Set.empty, + labels = TweetNegativeEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate features grouped by source author id: for each author, aggregate features are created + * to quantify negative engagements (don't like) which tweets of the author has received from + * verified users. + */ + val authorVerifiedNegativeEngagementRealTimeAggregatesProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_author_aggregates_v3", + preTransforms = Seq(RichRemoveUnverifiedUserTransform), + keys = Set(TimelinesSharedFeatures.SOURCE_AUTHOR_ID), + features = Set.empty, + labels = TweetNegativeEngagementDontLikeLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate tweet features grouped by topic id. + */ + val topicEngagementRealTimeAggregatesProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_topic_aggregates_v1", + keys = Set(TimelinesSharedFeatures.TOPIC_ID), + features = Set.empty, + labels = TweetLabels ++ AllTweetNegativeEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate user engagements / user state by topic id. 
+ */ + val topicEngagementUserStateRealTimeAggregatesProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_topic_aggregates_v2", + keys = Set(TimelinesSharedFeatures.TOPIC_ID), + features = UserFeaturesAdapter.UserStateBooleanFeatures, + labels = TweetCoreAndDwellLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyFeature = false, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate user negative engagements / user state by topic id. + */ + val topicNegativeEngagementUserStateRealTimeAggregatesProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_topic_aggregates_v3", + keys = Set(TimelinesSharedFeatures.TOPIC_ID), + features = UserFeaturesAdapter.UserStateBooleanFeatures, + labels = TweetNegativeEngagementLabels ++ TweetNegativeEngagementDontLikeLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyFeature = false, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate tweet features grouped by topic id like real_time_topic_aggregates_v1 but 24hour halfLife + */ + val topicEngagement24HourRealTimeAggregatesProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_topic_aggregates_v4", + keys = Set(TimelinesSharedFeatures.TOPIC_ID), + features = Set.empty, + labels = TweetLabels ++ AllTweetNegativeEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(24.hours), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + // Aggregate user engagements / user state by tweet Id. 
+ val tweetEngagementUserStateRealTimeAggregatesProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_aggregates_v3", + keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), + features = UserFeaturesAdapter.UserStateBooleanFeatures, + labels = TweetCoreAndDwellLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyFeature = false, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + // Aggregate user engagements / user gender by tweet Id. + val tweetEngagementGenderRealTimeAggregatesProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_aggregates_v4", + keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), + features = UserFeaturesAdapter.GenderBooleanFeatures, + labels = + TweetCoreAndDwellLabels ++ TweetNegativeEngagementLabels ++ TweetNegativeEngagementDontLikeLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyFeature = false, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + // Aggregate user negative engagements / user state by tweet Id. + val tweetNegativeEngagementUserStateRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_aggregates_v5", + keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), + features = UserFeaturesAdapter.UserStateBooleanFeatures, + labels = TweetNegativeEngagementLabels ++ TweetNegativeEngagementDontLikeLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyFeature = false, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + // Aggregate user negative engagements / user state by tweet Id. 
+ val tweetVerifiedNegativeEngagementUserStateRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_aggregates_v8", + preTransforms = Seq(RichRemoveUnverifiedUserTransform), + keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), + features = UserFeaturesAdapter.UserStateBooleanFeatures, + labels = TweetNegativeEngagementLabels ++ TweetNegativeEngagementDontLikeLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyFeature = false, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate tweet engagement labels and candidate tweet source features grouped by user id. + */ + val userCandidateTweetSourceEngagementRealTimeAggregatesProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_user_candidate_tweet_source_aggregates_v1", + keys = Set(SharedFeatures.USER_ID), + features = CandidateTweetSourceFeatures, + labels = TweetCoreAndDwellLabels ++ NegativeEngagementsRealTimeDontLike, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyFeature = false, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate tweet engagement labels and candidate tweet source features grouped by user id. 
+ */ + val userCandidateTweetSourceEngagement48HourRealTimeAggregatesProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_user_candidate_tweet_source_aggregates_v2", + keys = Set(SharedFeatures.USER_ID), + features = CandidateTweetSourceFeatures, + labels = TweetCoreAndDwellLabels ++ NegativeEngagementsRealTimeDontLike, + metrics = Set(CountMetric), + halfLives = Set(48.hours), + outputStore = ProductionStore, + includeAnyFeature = false, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate tweet features grouped by viewer's user id on Profile engagements + */ + val userProfileEngagementRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "profile_real_time_user_aggregates_v1", + preTransforms = Seq(IsNewUserTransform), + keys = Set(SharedFeatures.USER_ID), + features = TweetFeatures, + labels = ProfileCoreLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyFeature = true, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + val NegativeEngagementsUnionTransform = RichITransform( + BinaryUnion( + featuresToUnify = ProfileNegativeEngagementLabels, + outputFeature = ProfileLabelFeatures.IS_NEGATIVE_FEEDBACK_UNION + )) + + /** + * Aggregate tweet features grouped by viewer's user id on Profile negative engagements. 
+ */ + val userProfileNegativeEngagementRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "profile_negative_engagement_real_time_user_aggregates_v1", + preTransforms = Seq(NegativeEngagementsUnionTransform), + keys = Set(SharedFeatures.USER_ID), + features = Set.empty, + labels = ProfileNegativeEngagementLabels ++ ProfileNegativeEngagementUnionLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 72.hours, 14.day), + outputStore = ProductionStore, + includeAnyFeature = true, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate tweet features grouped by viewer's and author's user ids and on Profile engagements + */ + val userAuthorProfileEngagementRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "user_author_profile_real_time_aggregates_v1", + keys = Set(SharedFeatures.USER_ID, TimelinesSharedFeatures.SOURCE_AUTHOR_ID), + features = Set.empty, + labels = ProfileCoreLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 24.hours, 72.hours), + outputStore = ProductionStore, + includeAnyFeature = true, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate tweet features grouped by viewer's and author's user ids and on negative Profile engagements + */ + val userAuthorProfileNegativeEngagementRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "user_author_profile_negative_engagement_real_time_aggregates_v1", + preTransforms = Seq(NegativeEngagementsUnionTransform), + keys = Set(SharedFeatures.USER_ID, TimelinesSharedFeatures.SOURCE_AUTHOR_ID), + features = Set.empty, + labels = ProfileNegativeEngagementUnionLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 72.hours, 14.day), + outputStore = ProductionStore, + includeAnyFeature = true, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + val 
newUserAuthorEngagementRealTimeAggregatesProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_new_user_author_aggregates_v1", + preTransforms = Seq(IsNewUserTransform), + keys = Set(SharedFeatures.USER_ID, TimelinesSharedFeatures.SOURCE_AUTHOR_ID), + features = Set.empty, + labels = TweetCoreAndDwellLabels ++ Set( + IS_CLICKED, + IS_PROFILE_CLICKED, + IS_PHOTO_EXPANDED + ), + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyFeature = true, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + val userAuthorEngagementRealTimeAggregatesProd = { + // Computing user-author real-time aggregates is very expensive so we + // take the union of all major negative feedback engagements to create + // a single negative label for aggregation. We also include a number of + // core positive engagements. + val BinaryUnionNegativeEngagements = + BinaryUnion( + featuresToUnify = AllTweetNegativeEngagementLabels, + outputFeature = IS_NEGATIVE_FEEDBACK_UNION + ) + val BinaryUnionNegativeEngagementsTransform = RichITransform(BinaryUnionNegativeEngagements) + + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_user_author_aggregates_v1", + preTransforms = Seq(BinaryUnionNegativeEngagementsTransform), + keys = Set(SharedFeatures.USER_ID, TimelinesSharedFeatures.SOURCE_AUTHOR_ID), + features = Set.empty, + labels = UserAuthorEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 1.day), + outputStore = ProductionStore, + includeAnyFeature = true, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + } + + /** + * Aggregate tweet features grouped by list id.
+ */ + val listEngagementRealTimeAggregatesProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_list_aggregates_v1", + keys = Set(ListFeatures.LIST_ID), + features = Set.empty, + labels = + TweetCoreAndDwellLabels ++ TweetNegativeEngagementLabels ++ TweetNegativeEngagementDontLikeLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + // Aggregate features grouped by topic of tweet and country from user's location + val topicCountryRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_topic_country_aggregates_v1", + keys = Set(TimelinesSharedFeatures.TOPIC_ID, UserFeaturesAdapter.USER_COUNTRY_ID), + features = Set.empty, + labels = + TweetCoreAndDwellLabels ++ AllTweetNegativeEngagementLabels ++ PrivateEngagementLabelsV2 ++ ShareEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 72.hours), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + // Aggregate features grouped by TweetId_Country from user's location + val tweetCountryRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_country_aggregates_v1", + keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID, UserFeaturesAdapter.USER_COUNTRY_ID), + features = Set.empty, + labels = TweetCoreAndDwellLabels ++ AllTweetNegativeEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyLabel = true, + includeTimestampFeature = false, + ) + + // Additional aggregate features grouped by TweetId_Country from user's location + val tweetCountryPrivateEngagementsRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_country_aggregates_v2", + keys = 
Set(TimelinesSharedFeatures.SOURCE_TWEET_ID, UserFeaturesAdapter.USER_COUNTRY_ID), + features = Set.empty, + labels = PrivateEngagementLabelsV2 ++ ShareEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 72.hours), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + // Aggregate features grouped by TweetId_Country from user's location + val tweetCountryVerifiedNegativeEngagementsRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_country_aggregates_v3", + preTransforms = Seq(RichRemoveUnverifiedUserTransform), + keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID, UserFeaturesAdapter.USER_COUNTRY_ID), + features = Set.empty, + labels = AllTweetNegativeEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, Duration.Top), + outputStore = ProductionStore, + includeAnyLabel = true, + includeTimestampFeature = false, + ) + + object positionTranforms extends IsPositionTransform { + override val isInPositionRangeFeature: Seq[PositionCase] = + Seq(PositionCase(1, IS_TOP_ONE), PositionCase(5, IS_TOP_FIVE), PositionCase(10, IS_TOP_TEN)) + override val decodedPositionFeature: Feature.Discrete = + ClientLogEventDataRecordFeatures.InjectedPosition + } + + val userPositionEngagementsCountsProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_position_based_user_aggregates_v1", + keys = Set(SharedFeatures.USER_ID), + features = Set(IS_TOP_ONE, IS_TOP_FIVE, IS_TOP_TEN), + labels = TweetCoreAndDwellLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 24.hours), + outputStore = ProductionStore, + preTransforms = Seq(positionTranforms), + includeAnyLabel = false, + includeAnyFeature = false, + includeTimestampFeature = false, + ) + + val userPositionEngagementsSumProd = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_position_based_user_sum_aggregates_v2", + 
keys = Set(SharedFeatures.USER_ID), + features = Set(LOG_POSITION), + labels = TweetCoreAndDwellLabels, + metrics = Set(SumMetric), + halfLives = Set(30.minutes, 24.hours), + outputStore = ProductionStore, + preTransforms = + Seq(new LogTransform(ClientLogEventDataRecordFeatures.InjectedPosition, LOG_POSITION)), + includeAnyLabel = false, + includeAnyFeature = false, + includeTimestampFeature = false, + ) + + // Aggregates for share engagements + val tweetShareEngagementsRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_share_aggregates_v1", + keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), + features = Set.empty, + labels = ShareEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 24.hours), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + val userShareEngagementsRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_user_share_aggregates_v1", + keys = Set(SharedFeatures.USER_ID), + features = Set.empty, + labels = ShareEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 24.hours), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + val userAuthorShareEngagementsRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_user_author_share_aggregates_v1", + keys = Set(SharedFeatures.USER_ID, TimelinesSharedFeatures.SOURCE_AUTHOR_ID), + features = Set.empty, + labels = ShareEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 24.hours), + outputStore = ProductionStore, + includeAnyFeature = true, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + val topicShareEngagementsRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_topic_share_aggregates_v1", + keys = 
Set(TimelinesSharedFeatures.TOPIC_ID), + features = Set.empty, + labels = ShareEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 24.hours), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + val authorShareEngagementsRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_author_share_aggregates_v1", + keys = Set(TimelinesSharedFeatures.SOURCE_AUTHOR_ID), + features = Set.empty, + labels = ShareEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 24.hours), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + // Bookmark RTAs + val tweetBookmarkEngagementsRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_bookmark_aggregates_v1", + keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), + features = Set.empty, + labels = BookmarkEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 24.hours), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + val userBookmarkEngagementsRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_user_bookmark_aggregates_v1", + keys = Set(SharedFeatures.USER_ID), + features = Set.empty, + labels = BookmarkEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 24.hours), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + val userAuthorBookmarkEngagementsRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_user_author_bookmark_aggregates_v1", + keys = Set(SharedFeatures.USER_ID, TimelinesSharedFeatures.SOURCE_AUTHOR_ID), + features = Set.empty, + labels = BookmarkEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 24.hours), + 
outputStore = ProductionStore, + includeAnyFeature = true, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + val authorBookmarkEngagementsRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_author_bookmark_aggregates_v1", + keys = Set(TimelinesSharedFeatures.SOURCE_AUTHOR_ID), + features = Set.empty, + labels = BookmarkEngagementLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 24.hours), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate on user level dwell labels from BCE + */ + val userBCEDwellEngagementsRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_user_bce_dwell_aggregates", + keys = Set(SharedFeatures.USER_ID), + features = Set.empty, + labels = AllBCEDwellLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 24.hours), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + /** + * Aggregate on tweet level dwell labels from BCE + */ + val tweetBCEDwellEngagementsRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_tweet_bce_dwell_aggregates", + keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), + features = Set.empty, + labels = AllBCEDwellLabels, + metrics = Set(CountMetric), + halfLives = Set(30.minutes, 24.hours), + outputStore = ProductionStore, + includeAnyLabel = false, + includeTimestampFeature = false, + ) + + val ImplicitPositiveEngagementsUnionTransform = RichITransform( + BinaryUnion( + featuresToUnify = CombinedFeatures.ImplicitPositiveEngagements, + outputFeature = CombinedFeatures.IS_IMPLICIT_POSITIVE_FEEDBACK_UNION + ) + ) + + val ExplicitPositiveEngagementsUnionTransform = RichITransform( + BinaryUnion( + featuresToUnify = CombinedFeatures.ExplicitPositiveEngagements, + outputFeature = 
CombinedFeatures.IS_EXPLICIT_POSITIVE_FEEDBACK_UNION + ) + ) + + val AllNegativeEngagementsUnionTransform = RichITransform( + BinaryUnion( + featuresToUnify = CombinedFeatures.AllNegativeEngagements, + outputFeature = CombinedFeatures.IS_ALL_NEGATIVE_FEEDBACK_UNION + ) + ) + + /** + * Aggregate features for author content preference + */ + val authorContentPreferenceRealTimeAggregates = + AggregateGroup( + inputSource = inputSource, + aggregatePrefix = "real_time_author_content_preference_aggregates", + preTransforms = Seq( + ImplicitPositiveEngagementsUnionTransform, + ExplicitPositiveEngagementsUnionTransform, + AllNegativeEngagementsUnionTransform), + keys = Set(TimelinesSharedFeatures.SOURCE_AUTHOR_ID), + features = + ClientLogEventDataRecordFeatures.AuthorContentPreferenceTweetTypeFeatures ++ AuthorFeaturesAdapter.UserStateBooleanFeatures, + labels = AllTweetUnionLabels, + metrics = Set(CountMetric), + halfLives = Set(24.hours), + outputStore = ProductionStore, + includeAnyLabel = false, + includeAnyFeature = false, + ) + + val FeaturesGeneratedByPreTransforms = Set(LOG_POSITION, IS_TOP_TEN, IS_TOP_FIVE, IS_TOP_ONE) + + val ProdAggregateGroups = Set( + tweetEngagement30MinuteCountsProd, + tweetEngagementTotalCountsProd, + tweetNegativeEngagement6HourCounts, + tweetNegativeEngagementTotalCounts, + userEngagementRealTimeAggregatesProd, + userEngagement48HourRealTimeAggregatesProd, + userNegativeEngagementAuthorUserStateRealTimeAggregates, + userNegativeEngagementAuthorUserState72HourRealTimeAggregates, + authorEngagementRealTimeAggregatesProd, + topicEngagementRealTimeAggregatesProd, + topicEngagement24HourRealTimeAggregatesProd, + tweetEngagementUserStateRealTimeAggregatesProd, + tweetNegativeEngagementUserStateRealTimeAggregates, + userProfileEngagementRealTimeAggregates, + newUserAuthorEngagementRealTimeAggregatesProd, + userAuthorEngagementRealTimeAggregatesProd, + listEngagementRealTimeAggregatesProd, + tweetCountryRealTimeAggregates, + 
tweetShareEngagementsRealTimeAggregates, + userShareEngagementsRealTimeAggregates, + userAuthorShareEngagementsRealTimeAggregates, + topicShareEngagementsRealTimeAggregates, + authorShareEngagementsRealTimeAggregates, + tweetBookmarkEngagementsRealTimeAggregates, + userBookmarkEngagementsRealTimeAggregates, + userAuthorBookmarkEngagementsRealTimeAggregates, + authorBookmarkEngagementsRealTimeAggregates, + topicCountryRealTimeAggregates, + tweetCountryPrivateEngagementsRealTimeAggregates, + userBCEDwellEngagementsRealTimeAggregates, + tweetBCEDwellEngagementsRealTimeAggregates, + authorContentPreferenceRealTimeAggregates, + authorVerifiedNegativeEngagementRealTimeAggregatesProd, + tweetVerifiedDontLikeEngagementRealTimeAggregatesProd, + tweetVerifiedNegativeEngagementCounts, + tweetVerifiedNegativeEngagementUserStateRealTimeAggregates, + tweetCountryVerifiedNegativeEngagementsRealTimeAggregates + ).map( + addFeatureFilterFromResource( + _, + "com/twitter/timelines/prediction/common/aggregates/real_time/aggregates_to_drop.txt")) + + val StagingAggregateGroups = ProdAggregateGroups.map(createStagingGroup) + + /** + * Contains the fully typed aggregate groups from which important + * values can be derived e.g. the features to be computed, halflives etc. + */ + override val ProdAggregates = ProdAggregateGroups.flatMap(_.buildTypedAggregateGroups()) + + override val StagingAggregates = StagingAggregateGroups.flatMap(_.buildTypedAggregateGroups()) + + + override val ProdCommonAggregates = ProdAggregates + .filter(_.keysToAggregate == Set(SharedFeatures.USER_ID)) + + /** + * This defines the set of selected features from a candidate + * that we'd like to send to the served features cache by TLM. + * These should include interesting and necessary features that + * cannot be extracted from LogEvents only by the real-time aggregates + * job. If you are adding new AggregateGroups requiring TLM-side + * candidate features, make sure to add them here. 
+ */ + val candidateFeaturesToCache: Set[Feature[_]] = Set( + TimelinesSharedFeatures.SOURCE_AUTHOR_ID, + RecapFeatures.HASHTAGS, + RecapFeatures.MENTIONED_SCREEN_NAMES, + RecapFeatures.URL_DOMAINS + ) +} + +/** + * This config should only be used to access the aggregate features constructed by the + * aggregation config, and not for implementing an online real-time aggregates job. + */ +object TimelinesOnlineAggregationFeaturesOnlyConfig + extends TimelinesOnlineAggregationDefinitionsTrait { + + private[real_time] case class DummyAggregateSource(name: String, timestampFeature: Feature[JLong]) + extends AggregateSource + + private[real_time] case class DummyAggregateStore(name: String) extends AggregateStore + + override lazy val inputSource = DummyAggregateSource( + name = "timelines_rta", + timestampFeature = SharedFeatures.TIMESTAMP + ) + override lazy val ProductionStore = DummyAggregateStore("timelines_rta") + override lazy val StagingStore = DummyAggregateStore("timelines_rta") + + override lazy val AggregatesToCompute = ProdAggregates ++ StagingAggregates +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationSources.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationSources.scala new file mode 100644 index 0000000000..71e97a1b1e --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationSources.scala @@ -0,0 +1,5 @@ +package com.twitter.timelines.prediction.common.aggregates.real_time + +object TimelinesOnlineAggregationSources { + val timelinesOnlineAggregateSource = new TimelinesStormAggregateSource +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesRealTimeAggregatesJob.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesRealTimeAggregatesJob.scala new file mode 100644 index 0000000000..e386d4da17 --- 
/dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesRealTimeAggregatesJob.scala @@ -0,0 +1,182 @@ +package com.twitter.timelines.prediction.common.aggregates.real_time + +import com.twitter.conversions.DurationOps._ +import com.twitter.finagle.stats.DefaultStatsReceiver +import com.twitter.summingbird.Options +import com.twitter.summingbird.online.option.FlatMapParallelism +import com.twitter.summingbird.online.option.SourceParallelism +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron._ +import com.twitter.timelines.data_processing.ml_util.transforms.DownsampleTransform +import com.twitter.timelines.data_processing.ml_util.transforms.RichITransform +import com.twitter.timelines.data_processing.ml_util.transforms.UserDownsampleTransform + +import com.twitter.timelines.prediction.common.aggregates.BCELabelTransformFromUUADataRecord + +/** + * Sets up relevant topology parameters. Our primary goal is to handle the + * LogEvent stream and aggregate (sum) on the parsed DataRecords without falling + * behind. Our constraint is the resulting write (and read) QPS to the backing + * memcache store. + * + * If the job is falling behind, add more flatMappers and/or Summers after + * inspecting the viz panels for the respective job (go/heron-ui). An increase in + * Summers (and/or aggregation keys and features in the config) results in an + * increase in memcache QPS (go/cb and search for our cache). Adjust with CacheSize + * settings until QPS is well-controlled. + * + */ +object TimelinesRealTimeAggregatesJobConfigs extends RealTimeAggregatesJobConfigs { + import TimelinesOnlineAggregationUtils._ + + /** + * We remove input records that do not contain a label/engagement as defined in AllTweetLabels, which includes + * explicit user engagements including public, private and impression events. 
By avoiding ingesting records without + * engagements, we guarantee that no distribution shifts occur in computed aggregate features when we add a new spout + * to input aggregate sources. Counterfactual signal is still available since we aggregate on explicit dwell + * engagements. + */ + val NegativeDownsampleTransform = + DownsampleTransform( + negativeSamplingRate = 0.0, + keepLabels = AllTweetLabels, + positiveSamplingRate = 1.0) + + /** + * We downsample positive engagements for devel topology to reduce traffic, aiming for equivalent of 10% of prod traffic. + * First apply consistent downsampling to 10% of users, and then apply downsampling to remove records without + * explicit labels. We apply user-consistent sampling to more closely approximate prod query patterns. + */ + val StagingUserBasedDownsampleTransform = + UserDownsampleTransform( + availability = 1000, + featureName = "rta_devel" + ) + + override val Prod = RealTimeAggregatesJobConfig( + appId = "summingbird_timelines_rta", + topologyWorkers = 1450, + sourceCount = 120, + flatMapCount = 1800, + summerCount = 3850, + cacheSize = 200, + containerRamGigaBytes = 54, + name = "timelines_real_time_aggregates", + teamName = "timelines", + teamEmail = "", + // If one component is hitting GC limit at prod, tune componentToMetaSpaceSizeMap. + // Except for Source bolts. Tune componentToRamGigaBytesMap for Source bolts instead. + componentToMetaSpaceSizeMap = Map( + "Tail-FlatMap" -> "-XX:MaxMetaspaceSize=1024M -XX:MetaspaceSize=1024M", + "Tail" -> "-XX:MaxMetaspaceSize=2560M -XX:MetaspaceSize=2560M" + ), + // If either component is hitting memory limit at prod + // its memory need to increase: either increase total memory of container (containerRamGigaBytes), + // or allocate more memory for one component while keeping total memory unchanged.
+ componentToRamGigaBytesMap = Map( + "Tail-FlatMap-Source" -> 3, // Home source + "Tail-FlatMap-Source.2" -> 3, // Profile source + "Tail-FlatMap-Source.3" -> 3, // Search source + "Tail-FlatMap-Source.4" -> 3, // UUA source + "Tail-FlatMap" -> 8 + // Tail will use the leftover memory in the container. + // Make sure to tune topologyWorkers and containerRamGigaBytes such that this is greater than 10 GB. + ), + topologyNamedOptions = Map( + "TL_EVENTS_SOURCE" -> Options() + .set(SourceParallelism(120)), + "PROFILE_EVENTS_SOURCE" -> Options() + .set(SourceParallelism(30)), + "SEARCH_EVENTS_SOURCE" -> Options() + .set(SourceParallelism(10)), + "UUA_EVENTS_SOURCE" -> Options() + .set(SourceParallelism(10)), + "COMBINED_PRODUCER" -> Options() + .set(FlatMapParallelism(1800)) + ), + // The UUA datarecord for BCE events inputted will not have binary labels populated. + // BCELabelTransform will set the datarecord with binary BCE dwell labels features based on the corresponding dwell_time_ms. + // It's important to have the BCELabelTransformFromUUADataRecord before NegativeDownsampleTransform + // because NegativeDownsampleTransform will remove datarecord that contains no features from AllTweetLabels. + onlinePreTransforms = + Seq(RichITransform(BCELabelTransformFromUUADataRecord), NegativeDownsampleTransform) + ) + + /** + * We downsample 10% computation of devel RTA based on [[StagingUserBasedDownsampleTransform]]. + * To better test scalability of topology, we reduce computing resource of components "Tail-FlatMap" + * and "Tail" to be 10% of prod but keep computing resource of component "Tail-FlatMap-Source" unchanged. + * hence flatMapCount=110, summerCount=105 and sourceCount=100. Hence topologyWorkers =(110+105+100)/5 = 63. + * NOTE(review): these figures do not match the values configured below (flatMapCount = 150, + * summerCount = 300, sourceCount = 120, topologyWorkers = 120) -- confirm which is current.
+ */ + override val Devel = RealTimeAggregatesJobConfig( + appId = "summingbird_timelines_rta_devel", + topologyWorkers = 120, + sourceCount = 120, + flatMapCount = 150, + summerCount = 300, + cacheSize = 200, + containerRamGigaBytes = 54, + name = "timelines_real_time_aggregates_devel", + teamName = "timelines", + teamEmail = "", + // If one component is hitting GC limit at prod, tune componentToMetaSpaceSizeMap + // Except for Source bolts. Tune componentToRamGigaBytesMap for Source bolts instead. + componentToMetaSpaceSizeMap = Map( + "Tail-FlatMap" -> "-XX:MaxMetaspaceSize=1024M -XX:MetaspaceSize=1024M", + "Tail" -> "-XX:MaxMetaspaceSize=2560M -XX:MetaspaceSize=2560M" + ), + // If either component is hitting memory limit at prod + // its memory need to increase: either increase total memory of container (containerRamGigaBytes), + // or allocate more memory for one component while keeping total memory unchanged. + componentToRamGigaBytesMap = Map( + "Tail-FlatMap-Source" -> 3, // Home source + "Tail-FlatMap-Source.2" -> 3, // Profile source + "Tail-FlatMap-Source.3" -> 3, // Search source + "Tail-FlatMap-Source.4" -> 3, // UUA source + "Tail-FlatMap" -> 8 + // Tail will use the leftover memory in the container. + // Make sure to tune topologyWorkers and containerRamGigaBytes such that this is greater than 10 GB. 
+ ), + topologyNamedOptions = Map( + "TL_EVENTS_SOURCE" -> Options() + .set(SourceParallelism(120)), + "PROFILE_EVENTS_SOURCE" -> Options() + .set(SourceParallelism(30)), + "SEARCH_EVENTS_SOURCE" -> Options() + .set(SourceParallelism(10)), + "UUA_EVENTS_SOURCE" -> Options() + .set(SourceParallelism(10)), + "COMBINED_PRODUCER" -> Options() + .set(FlatMapParallelism(150)) + ), + // It's important to have the BCELabelTransformFromUUADataRecord before NegativeDownsampleTransform + onlinePreTransforms = Seq( + StagingUserBasedDownsampleTransform, + RichITransform(BCELabelTransformFromUUADataRecord), + NegativeDownsampleTransform), + enableUserReindexingNighthawkBtreeStore = true, + enableUserReindexingNighthawkHashStore = true, + userReindexingNighthawkBtreeStoreConfig = NighthawkUnderlyingStoreConfig( + serversetPath = + "/twitter/service/cache-user/test/nighthawk_timelines_real_time_aggregates_btree_test_api", + // NOTE: table names are prefixed to every pkey so keep it short + tableName = "u_r_v1", // (u)ser_(r)eindexing_v1 + // keep ttl <= 1 day because it's keyed on user, and we will have limited hit rates beyond 1 day + cacheTTL = 1.day + ), + userReindexingNighthawkHashStoreConfig = NighthawkUnderlyingStoreConfig( + // For prod: "/s/cache-user/nighthawk_timelines_real_time_aggregates_hash_api", + serversetPath = + "/twitter/service/cache-user/test/nighthawk_timelines_real_time_aggregates_hash_test_api", + // NOTE: table names are prefixed to every pkey so keep it short + tableName = "u_r_v1", // (u)ser_(r)eindexing_v1 + // keep ttl <= 1 day because it's keyed on user, and we will have limited hit rates beyond 1 day + cacheTTL = 1.day + ) + ) +} + +object TimelinesRealTimeAggregatesJob extends RealTimeAggregatesJobBase { + override lazy val statsReceiver = DefaultStatsReceiver.scope("timelines_real_time_aggregates") + override lazy val jobConfigs = TimelinesRealTimeAggregatesJobConfigs + override lazy val aggregatesToCompute = 
TimelinesOnlineAggregationConfig.AggregatesToCompute +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesStormAggregateSource.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesStormAggregateSource.scala new file mode 100644 index 0000000000..2e096dc079 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesStormAggregateSource.scala @@ -0,0 +1,185 @@ +package com.twitter.timelines.prediction.common.aggregates.real_time + +import com.twitter.clientapp.thriftscala.LogEvent +import com.twitter.conversions.DurationOps._ +import com.twitter.finagle.stats.Counter +import com.twitter.finagle.stats.StatsReceiver +import com.twitter.ml.api.DataRecord +import com.twitter.ml.api.constant.SharedFeatures +import com.twitter.snowflake.id.SnowflakeId +import com.twitter.summingbird._ +import com.twitter.summingbird.storm.Storm +import com.twitter.summingbird_internal.sources.AppId +import com.twitter.summingbird_internal.sources.storm.remote.ClientEventSourceScrooge2 +import com.twitter.timelines.data_processing.ad_hoc.suggests.common.AllScribeProcessor +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.RealTimeAggregatesJobConfig +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.StormAggregateSource +import com.twitter.timelines.prediction.adapters.client_log_event.ClientLogEventAdapter +import com.twitter.timelines.prediction.adapters.client_log_event.ProfileClientLogEventAdapter +import com.twitter.timelines.prediction.adapters.client_log_event.SearchClientLogEventAdapter +import com.twitter.timelines.prediction.adapters.client_log_event.UuaEventAdapter +import com.twitter.unified_user_actions.client.config.KafkaConfigs +import com.twitter.unified_user_actions.client.summingbird.UnifiedUserActionsSourceScrooge +import com.twitter.unified_user_actions.thriftscala.UnifiedUserAction +import 
scala.collection.JavaConverters._ + +/** + * Storm Producer for client events generated on Home, Profile, and Search, plus Unified User Actions (UUA) events + */ +class TimelinesStormAggregateSource extends StormAggregateSource { + + override val name = "timelines_rta" + override val timestampFeature = SharedFeatures.TIMESTAMP + + /* Names attached to the producers built in build() via .name(...) */ private lazy val TimelinesClientEventSourceName = "TL_EVENTS_SOURCE" + private lazy val ProfileClientEventSourceName = "PROFILE_EVENTS_SOURCE" + private lazy val SearchClientEventSourceName = "SEARCH_EVENTS_SOURCE" + private lazy val UuaEventSourceName = "UUA_EVENTS_SOURCE" + private lazy val CombinedProducerName = "COMBINED_PRODUCER" + private lazy val FeatureStoreProducerName = "FEATURE_STORE_PRODUCER" + + /** True when the event carries a user id whose Snowflake-derived time is less than 30 days before now. NOTE(review): not referenced anywhere in this class. */ private def isNewUserEvent(event: LogEvent): Boolean = { + event.logBase.flatMap(_.userId).flatMap(SnowflakeId.timeFromIdOpt).exists(_.untilNow < 30.days) + } + + /** Adapts a Home LogEvent with ClientLogEventAdapter when AllScribeProcessor.isValidSuggestTweetEvent accepts it (otherwise yields no records) and increments dataRecordCounter by the number of records. */ private def mkDataRecords(event: LogEvent, dataRecordCounter: Counter): Seq[DataRecord] = { + val dataRecords: Seq[DataRecord] = + if (AllScribeProcessor.isValidSuggestTweetEvent(event)) { + ClientLogEventAdapter.adaptToDataRecords(event).asScala + } else { + Seq.empty[DataRecord] + } + dataRecordCounter.incr(dataRecords.size) + dataRecords + } + + /** Adapts a Profile LogEvent with ProfileClientLogEventAdapter and increments dataRecordCounter by the number of records. */ private def mkProfileDataRecords( + event: LogEvent, + dataRecordCounter: Counter + ): Seq[DataRecord] = { + val dataRecords: Seq[DataRecord] = + ProfileClientLogEventAdapter.adaptToDataRecords(event).asScala + dataRecordCounter.incr(dataRecords.size) + dataRecords + } + + /** Adapts a Search LogEvent with SearchClientLogEventAdapter and increments dataRecordCounter by the number of records. */ private def mkSearchDataRecords( + event: LogEvent, + dataRecordCounter: Counter + ): Seq[DataRecord] = { + val dataRecords: Seq[DataRecord] = + SearchClientLogEventAdapter.adaptToDataRecords(event).asScala + dataRecordCounter.incr(dataRecords.size) + dataRecords + } + + /** Adapts a UnifiedUserAction with UuaEventAdapter and increments dataRecordCounter by the number of records. */ private def mkUuaDataRecords( + event: UnifiedUserAction, + dataRecordCounter: Counter + ): Seq[DataRecord] = { + val dataRecords: Seq[DataRecord] = + UuaEventAdapter.adaptToDataRecords(event).asScala + 
dataRecordCounter.incr(dataRecords.size) + dataRecords + } + + /** Builds the producer: four tagged sources (Home, Profile and Search client events, UUA) are merged, adapted to DataRecords, run through jobConfig.sequentiallyTransform, and finally passed through StormAggregateSourceUtils.wrapByFeatureStoreClient. */ override def build( + statsReceiver: StatsReceiver, + jobConfig: RealTimeAggregatesJobConfig + ): Producer[Storm, DataRecord] = { + lazy val scopedStatsReceiver = statsReceiver.scope(getClass.getSimpleName) + lazy val dataRecordCounter = scopedStatsReceiver.counter("dataRecord") + + // Home Timeline Engagements + // Step 1: => LogEvent + lazy val clientEventProducer: Producer[Storm, HomeEvent[LogEvent]] = + ClientEventSourceScrooge2( + appId = AppId(jobConfig.appId), + topic = "julep_client_event_suggests", + resumeAtLastReadOffset = false, + enableTls = true + ).source.map(HomeEvent[LogEvent]).name(TimelinesClientEventSourceName) + + // Profile Engagements + // Step 1: => LogEvent + lazy val profileClientEventProducer: Producer[Storm, ProfileEvent[LogEvent]] = + ClientEventSourceScrooge2( + appId = AppId(jobConfig.appId), + topic = "julep_client_event_profile_real_time_engagement_metrics", + resumeAtLastReadOffset = false, + enableTls = true + ).source + .map(ProfileEvent[LogEvent]) + .name(ProfileClientEventSourceName) + + // Search Engagements + // Step 1: => LogEvent + // NOTE(review): no user-level filtering is applied to this source (isNewUserEvent is unused) - confirm intent + lazy val searchClientEventProducer: Producer[Storm, SearchEvent[LogEvent]] = + ClientEventSourceScrooge2( + appId = AppId(jobConfig.appId), + topic = "julep_client_event_search_real_time_engagement_metrics", + resumeAtLastReadOffset = false, + enableTls = true + ).source + .map(SearchEvent[LogEvent]) + .name(SearchClientEventSourceName) + + // Unified User Actions (includes Home and other product surfaces) + lazy val uuaEventProducer: Producer[Storm, UuaEvent[UnifiedUserAction]] = + UnifiedUserActionsSourceScrooge( + appId = AppId(jobConfig.appId), + parallelism = 10, + kafkaConfig = KafkaConfigs.ProdUnifiedUserActionsEngagementOnly + ).source + .filter(StormAggregateSourceUtils.isUuaBCEEventsFromHome(_)) + .map(UuaEvent[UnifiedUserAction]) + .name(UuaEventSourceName) + + // 
Combined + // Step 2: + // (a) Combine + // (b) Transform LogEvent / UnifiedUserAction => Seq[DataRecord] + // (c) Apply sampler (jobConfig.sequentiallyTransform) + lazy val combinedClientEventDataRecordProducer: Producer[Storm, Event[DataRecord]] = + profileClientEventProducer // This becomes the bottom branch + .merge(clientEventProducer) // This becomes the middle branch + .merge(searchClientEventProducer) + .merge(uuaEventProducer) // This becomes the top + .flatMap { // LogEvent / UnifiedUserAction => Seq[DataRecord]; type arguments in these patterns are erased, so dispatch is on the wrapper class only + case e: HomeEvent[LogEvent] => + mkDataRecords(e.event, dataRecordCounter).map(HomeEvent[DataRecord]) + case e: ProfileEvent[LogEvent] => + mkProfileDataRecords(e.event, dataRecordCounter).map(ProfileEvent[DataRecord]) + case e: SearchEvent[LogEvent] => + mkSearchDataRecords(e.event, dataRecordCounter).map(SearchEvent[DataRecord]) + case e: UuaEvent[UnifiedUserAction] => + mkUuaDataRecords( + e.event, + dataRecordCounter + ).map(UuaEvent[DataRecord]) + } + .flatMap { // Apply sampler; each result is re-wrapped in the same event tag it arrived with + case e: HomeEvent[DataRecord] => + jobConfig.sequentiallyTransform(e.event).map(HomeEvent[DataRecord]) + case e: ProfileEvent[DataRecord] => + jobConfig.sequentiallyTransform(e.event).map(ProfileEvent[DataRecord]) + case e: SearchEvent[DataRecord] => + jobConfig.sequentiallyTransform(e.event).map(SearchEvent[DataRecord]) + case e: UuaEvent[DataRecord] => + jobConfig.sequentiallyTransform(e.event).map(UuaEvent[DataRecord]) + } + .name(CombinedProducerName) + + // Step 3: Join with Feature Store features + lazy val featureStoreDataRecordProducer: Producer[Storm, DataRecord] = + StormAggregateSourceUtils + .wrapByFeatureStoreClient( + underlyingProducer = combinedClientEventDataRecordProducer, + jobConfig = jobConfig, + scopedStatsReceiver = scopedStatsReceiver + ).map(_.event).name(FeatureStoreProducerName) + + featureStoreDataRecordProducer + } +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesAdapter.scala 
b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesAdapter.scala new file mode 100644 index 0000000000..0d5c06d7cf --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesAdapter.scala @@ -0,0 +1,35 @@ +package com.twitter.timelines.prediction.common.aggregates.real_time + +import com.twitter.ml.api.DataRecord +import com.twitter.ml.api.Feature +import com.twitter.ml.api.FeatureContext +import com.twitter.ml.featurestore.catalog.entities.core.Tweet +import com.twitter.ml.featurestore.catalog.features.trends.TweetTrendsScores +import com.twitter.ml.featurestore.lib.TweetId +import com.twitter.ml.featurestore.lib.data.PredictionRecord +import com.twitter.ml.featurestore.lib.data.PredictionRecordAdapter +import com.twitter.ml.featurestore.lib.feature.BoundFeature +import com.twitter.ml.featurestore.lib.feature.BoundFeatureSet +import com.twitter.timelines.prediction.common.adapters.TimelinesAdapterBase +import java.util +import scala.collection.JavaConverters._ + +/** Adapter from feature-store PredictionRecords to DataRecords for tweet features. ContinuousFeatureMap is empty here, so TweetFeaturesSet and AllFeatures are empty as well. */ object TweetFeaturesAdapter extends TimelinesAdapterBase[PredictionRecord] { + + /* Bound feature-store feature -> DataRecord continuous feature; currently has no entries */ private val ContinuousFeatureMap: Map[BoundFeature[TweetId, Double], Feature.Continuous] = Map() + + val TweetFeaturesSet: BoundFeatureSet = new BoundFeatureSet(ContinuousFeatureMap.keys.toSet) + + val AllFeatures: Seq[Feature[_]] = + ContinuousFeatureMap.values.toSeq + + private val adapter = PredictionRecordAdapter.oneToOne(TweetFeaturesSet) + + override def getFeatureContext: FeatureContext = new FeatureContext(AllFeatures: _*) + + override def commonFeatures: Set[Feature[_]] = Set.empty + + /** Returns a single-element list holding the DataRecord produced by the one-to-one PredictionRecordAdapter. */ override def adaptToDataRecords(record: PredictionRecord): util.List[DataRecord] = { + List(adapter.adaptToDataRecord(record)).asJava + } +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesReadableStore.scala 
b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesReadableStore.scala new file mode 100644 index 0000000000..b461e179a9 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesReadableStore.scala @@ -0,0 +1,53 @@ +package com.twitter.timelines.prediction.common.aggregates.real_time + +import com.twitter.ml.api.DataRecord +import com.twitter.ml.featurestore.lib.TweetId +import com.twitter.ml.featurestore.lib.data.PredictionRecord +import com.twitter.ml.featurestore.lib.entity.Entity +import com.twitter.ml.featurestore.lib.online.{FeatureStoreClient, FeatureStoreRequest} +import com.twitter.storehaus.ReadableStore +import com.twitter.timelines.prediction.common.adapters.TimelinesAdapterBase +import com.twitter.util.Future +import scala.collection.JavaConverters._ + +/** ReadableStore keyed by a set of tweet ids: each key becomes one FeatureStoreRequest, and the PredictionRecord at the same position in the response is adapted to a DataRecord. */ class TweetFeaturesReadableStore( + featureStoreClient: FeatureStoreClient, + tweetEntity: Entity[TweetId], + tweetFeaturesAdapter: TimelinesAdapterBase[PredictionRecord]) + extends ReadableStore[Set[Long], DataRecord] { + + /** Issues a single featureStoreClient call for all keys (in keys.toSeq order) and returns one future per key. */ override def multiGet[K <: Set[Long]](keys: Set[K]): Map[K, Future[Option[DataRecord]]] = { + val orderedKeys: Seq[K] = keys.toSeq + val featureStoreRequests: Seq[FeatureStoreRequest] = getFeatureStoreRequests(orderedKeys) + val predictionRecordsFut: Future[Seq[PredictionRecord]] = featureStoreClient( + featureStoreRequests) + + getDataRecordMap(orderedKeys, predictionRecordsFut) + } + + /** One request per key, holding one tweet entity id per tweet id in that key. */ private def getFeatureStoreRequests[K <: Set[Long]]( + orderedKeys: Seq[K] + ): Seq[FeatureStoreRequest] = { + orderedKeys.map { key: Set[Long] => + FeatureStoreRequest( + entityIds = key.map { tweetId => tweetEntity.withId(TweetId(tweetId)) }.toSeq + ) + } + } + + /** Pairs each key with the record at the same index; yields None when the response has no record at that index or the adapter returns an empty list. Presumably the client returns records in request order - TODO confirm. */ private def getDataRecordMap[K <: Set[Long]]( + orderedKeys: Seq[K], + predictionRecordsFut: Future[Seq[PredictionRecord]] + ): Map[K, Future[Option[DataRecord]]] = { + orderedKeys.zipWithIndex.map { + case (tweetIdSet, index) => + val dataRecordFutOpt: 
Future[Option[DataRecord]] = predictionRecordsFut.map { + predictionRecords => + predictionRecords.lift(index).flatMap { predictionRecordAtIndex: PredictionRecord => + tweetFeaturesAdapter.adaptToDataRecords(predictionRecordAtIndex).asScala.headOption + } + } + (tweetIdSet, dataRecordFutOpt) + }.toMap + } +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TypeSafeRunner.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TypeSafeRunner.scala new file mode 100644 index 0000000000..92b6618e43 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TypeSafeRunner.scala @@ -0,0 +1,7 @@ +package com.twitter.timelines.prediction.common.aggregates.real_time + +import com.twitter.summingbird_internal.runner.storm.GenericRunner + +/** Command-line entry point: hands args and TimelinesRealTimeAggregatesJob to GenericRunner. */ object TypeSafeRunner { + def main(args: Array[String]): Unit = GenericRunner(args, TimelinesRealTimeAggregatesJob(_)) +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesAdapter.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesAdapter.scala new file mode 100644 index 0000000000..8ff39938cf --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesAdapter.scala @@ -0,0 +1,108 @@ +package com.twitter.timelines.prediction.common.aggregates.real_time + +import com.twitter.dal.personal_data.thriftjava.PersonalDataType.InferredGender +import com.twitter.dal.personal_data.thriftjava.PersonalDataType.UserState +import com.twitter.ml.api.Feature.Binary +import com.twitter.ml.api.Feature.Text +import com.twitter.ml.api.DataRecord +import com.twitter.ml.api.Feature +import com.twitter.ml.api.FeatureContext +import com.twitter.ml.api.RichDataRecord +import com.twitter.ml.featurestore.catalog.entities.core.User +import com.twitter.ml.featurestore.catalog.features.core.UserAccount +import 
com.twitter.ml.featurestore.catalog.features.geo.UserLocation +import com.twitter.ml.featurestore.catalog.features.magicrecs.UserActivity +import com.twitter.ml.featurestore.lib.EntityId +import com.twitter.ml.featurestore.lib.data.PredictionRecord +import com.twitter.ml.featurestore.lib.feature.BoundFeature +import com.twitter.ml.featurestore.lib.feature.BoundFeatureSet +import com.twitter.ml.featurestore.lib.UserId +import com.twitter.ml.featurestore.lib.{Discrete => FSDiscrete} +import com.twitter.timelines.prediction.common.adapters.TimelinesAdapterBase +import com.twitter.timelines.prediction.features.user_health.UserHealthFeatures +import java.lang.{Boolean => JBoolean} +import java.lang.{String => JString} +import java.util +import scala.collection.JavaConverters._ + +/** Adapter from feature-store PredictionRecords to DataRecords for user features: a binary user-state feature, the country code, and a combined verified flag. */ object UserFeaturesAdapter extends TimelinesAdapterBase[PredictionRecord] { + val UserStateBoundFeature: BoundFeature[UserId, FSDiscrete] = UserActivity.UserState.bind(User) + + /** + * Boolean features about viewer's user state. 
+ * enum UserState { + * NEW = 0, + * NEAR_ZERO = 1, + * VERY_LIGHT = 2, + * LIGHT = 3, + * MEDIUM_TWEETER = 4, + * MEDIUM_NON_TWEETER = 5, + * HEAVY_NON_TWEETER = 6, + * HEAVY_TWEETER = 7 + * }(persisted='true'). States 1-3 (NEAR_ZERO, VERY_LIGHT, LIGHT) all map to IS_USER_LIGHT in userStateToFeatureMap. + */ + val IS_USER_NEW = new Binary("timelines.user_state.is_user_new", Set(UserState).asJava) + val IS_USER_LIGHT = new Binary("timelines.user_state.is_user_light", Set(UserState).asJava) + val IS_USER_MEDIUM_TWEETER = + new Binary("timelines.user_state.is_user_medium_tweeter", Set(UserState).asJava) + val IS_USER_MEDIUM_NON_TWEETER = + new Binary("timelines.user_state.is_user_medium_non_tweeter", Set(UserState).asJava) + val IS_USER_HEAVY_NON_TWEETER = + new Binary("timelines.user_state.is_user_heavy_non_tweeter", Set(UserState).asJava) + val IS_USER_HEAVY_TWEETER = + new Binary("timelines.user_state.is_user_heavy_tweeter", Set(UserState).asJava) + /* UserState enum value -> binary feature set on the output record */ val userStateToFeatureMap: Map[Long, Binary] = Map( + 0L -> IS_USER_NEW, + 1L -> IS_USER_LIGHT, + 2L -> IS_USER_LIGHT, + 3L -> IS_USER_LIGHT, + 4L -> IS_USER_MEDIUM_TWEETER, + 5L -> IS_USER_MEDIUM_NON_TWEETER, + 6L -> IS_USER_HEAVY_NON_TWEETER, + 7L -> IS_USER_HEAVY_TWEETER + ) + + val UserStateBooleanFeatures: Set[Feature[_]] = userStateToFeatureMap.values.toSet + + + val USER_COUNTRY_ID = new Text("geo.user_location.country_code") + val UserCountryCodeFeature: BoundFeature[UserId, String] = + UserLocation.CountryCodeAlpha2.bind(User) + val UserLocationFeatures: Set[Feature[_]] = Set(USER_COUNTRY_ID) + + /* The four verified flags that are OR-ed into UserHealthFeatures.IsUserVerifiedUnion */ private val UserVerifiedFeaturesSet = Set( + UserAccount.IsUserVerified.bind(User), + UserAccount.IsUserBlueVerified.bind(User), + UserAccount.IsUserGoldVerified.bind(User), + UserAccount.IsUserGrayVerified.bind(User) + ) + + /* Bound features requested from the feature store */ val UserFeaturesSet: BoundFeatureSet = + BoundFeatureSet(UserStateBoundFeature, UserCountryCodeFeature) ++ + BoundFeatureSet(UserVerifiedFeaturesSet.asInstanceOf[Set[BoundFeature[_ <: EntityId, _]]]) + + /* NOTE(review): GenderBooleanFeatures is neither defined nor imported in this file - confirm it resolves from package scope, otherwise this does not compile */ private val allFeatures: Seq[Feature[_]] = + UserStateBooleanFeatures.toSeq ++ 
GenderBooleanFeatures.toSeq ++ + UserLocationFeatures.toSeq ++ Seq(UserHealthFeatures.IsUserVerifiedUnion) + + override def getFeatureContext: FeatureContext = new FeatureContext(allFeatures: _*) + override def commonFeatures: Set[Feature[_]] = Set.empty + + /** Builds one DataRecord: sets the binary feature mapped from the record's UserState value (if any), the country-code text feature (if present), and IsUserVerifiedUnion, which is true when any of the four verified flags is true (a missing flag counts as false). */ override def adaptToDataRecords(record: PredictionRecord): util.List[DataRecord] = { + val newRecord = new RichDataRecord(new DataRecord) + record + .getFeatureValue(UserStateBoundFeature) + .flatMap { userState => userStateToFeatureMap.get(userState.value) }.foreach { + booleanFeature => newRecord.setFeatureValue[JBoolean](booleanFeature, true) + } + record.getFeatureValue(UserCountryCodeFeature).foreach { countryCodeFeatureValue => + newRecord.setFeatureValue[JString](USER_COUNTRY_ID, countryCodeFeatureValue) + } + + val isUserVerifiedUnion = + UserVerifiedFeaturesSet.exists(feature => record.getFeatureValue(feature).getOrElse(false)) + newRecord.setFeatureValue[JBoolean](UserHealthFeatures.IsUserVerifiedUnion, isUserVerifiedUnion) + + List(newRecord.getRecord).asJava + } +} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesReadableStore.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesReadableStore.scala new file mode 100644 index 0000000000..c1931c32bc --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesReadableStore.scala @@ -0,0 +1,37 @@ +package com.twitter.timelines.prediction.common.aggregates.real_time + +import com.twitter.ml.api.DataRecord +import com.twitter.ml.featurestore.lib.UserId +import com.twitter.ml.featurestore.lib.data.PredictionRecord +import com.twitter.ml.featurestore.lib.entity.Entity +import com.twitter.ml.featurestore.lib.online.{FeatureStoreClient, FeatureStoreRequest} +import com.twitter.storehaus.ReadableStore +import com.twitter.timelines.prediction.common.adapters.TimelinesAdapterBase +import com.twitter.util.Future +import 
scala.collection.JavaConverters._ + +/** ReadableStore keyed by a set of user ids: each key becomes one FeatureStoreRequest, and the PredictionRecord at the same position in the response is adapted to a DataRecord. */ class UserFeaturesReadableStore( + featureStoreClient: FeatureStoreClient, + userEntity: Entity[UserId], + userFeaturesAdapter: TimelinesAdapterBase[PredictionRecord]) + extends ReadableStore[Set[Long], DataRecord] { + + /** Issues one featureStoreClient call for all keys and returns a future per key; a key resolves to None when the response has no record at its index or the adapter returns no record. */ override def multiGet[K <: Set[Long]](keys: Set[K]): Map[K, Future[Option[DataRecord]]] = { + val orderedKeys = keys.toSeq + val featureStoreRequests: Seq[FeatureStoreRequest] = orderedKeys.map { key: Set[Long] => + FeatureStoreRequest( + entityIds = key.map(userId => userEntity.withId(UserId(userId))).toSeq + ) + } + val predictionRecordsFut: Future[Seq[PredictionRecord]] = featureStoreClient( + featureStoreRequests) + + orderedKeys.zipWithIndex.map { + case (userIdSet, index) => + val dataRecordFutOpt = predictionRecordsFut.map { predictionRecords => /* lift avoids IndexOutOfBoundsException when fewer records than requests come back */ + predictionRecords.lift(index).flatMap { predictionRecord => userFeaturesAdapter.adaptToDataRecords(predictionRecord).asScala.headOption } + } + (userIdSet, dataRecordFutOpt) + }.toMap + } +} diff --git a/src/scala/com/twitter/timelines/prediction/features/README.md b/src/scala/com/twitter/timelines/prediction/features/README.md new file mode 100644 index 0000000000..d42639a774 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/README.md @@ -0,0 +1,6 @@ +## Prediction Features + +This directory contains a collection of `Features` (`com.twitter.ml.api.Feature`) which are definitions of feature names and datatypes which allow the features to be efficiently processed and passed to the different ranking models. +By predefining the features with their names and datatypes, when features are being generated, scribed or used to score they can be identified with only a hash of their name. + +Not all of these features are used in the model, many are experimental or deprecated. 
\ No newline at end of file diff --git a/src/scala/com/twitter/timelines/prediction/features/client_log_event/BUILD b/src/scala/com/twitter/timelines/prediction/features/client_log_event/BUILD new file mode 100644 index 0000000000..3d3c34092a --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/client_log_event/BUILD @@ -0,0 +1,11 @@ +scala_library( + sources = ["*.scala"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "src/java/com/twitter/ml/api:api-base", + "src/scala/com/twitter/suggests/controller_data", + "src/thrift/com/twitter/dal/personal_data:personal_data-java", + "src/thrift/com/twitter/timelineservice/server/suggests/logging:thrift-scala", + ], +) diff --git a/src/scala/com/twitter/timelines/prediction/features/client_log_event/ClientLogEventDataRecordFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/client_log_event/ClientLogEventDataRecordFeatures.scala new file mode 100644 index 0000000000..cccb99998a --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/client_log_event/ClientLogEventDataRecordFeatures.scala @@ -0,0 +1,169 @@ +package com.twitter.timelines.prediction.features.client_log_event + +import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ +import com.twitter.ml.api.Feature +import com.twitter.ml.api.Feature.Binary +import com.twitter.ml.api.Feature.Continuous +import com.twitter.ml.api.Feature.Discrete +import scala.collection.JavaConverters._ +import com.twitter.timelineservice.suggests.logging.candidate_tweet_source_id.thriftscala.CandidateTweetSourceId + +/** Feature definitions under the client_log_event.tweet.* namespace (names, types and, where given, personal-data annotations), plus lookup maps and grouped feature sets built from them. */ object ClientLogEventDataRecordFeatures { + val HasConsumerVideo = new Binary( + "client_log_event.tweet.has_consumer_video", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val PhotoCount = new Continuous( + "client_log_event.tweet.photo_count", + Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava) + val HasImage = 
new Binary( + "client_log_event.tweet.has_image", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val IsReply = + new Binary("client_log_event.tweet.is_reply", Set(PublicReplies, PrivateReplies).asJava) + val IsRetweet = + new Binary("client_log_event.tweet.is_retweet", Set(PublicRetweets, PrivateRetweets).asJava) + val IsPromoted = + new Binary( + "client_log_event.tweet.is_promoted", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HasVisibleLink = new Binary( + "client_log_event.tweet.has_visible_link", + Set(UrlFoundFlag, PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HasHashtag = new Binary( + "client_log_event.tweet.has_hashtag", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val FromMutualFollow = new Binary("client_log_event.tweet.from_mutual_follow") + val IsInNetwork = new Binary("client_log_event.tweet.is_in_network") + val IsNotInNetwork = new Binary("client_log_event.tweet.is_not_in_network") + val FromRecap = new Binary("client_log_event.tweet.from_recap") + val FromRecycled = new Binary("client_log_event.tweet.from_recycled") + val FromActivity = new Binary("client_log_event.tweet.from_activity") + val FromSimcluster = new Binary("client_log_event.tweet.from_simcluster") + val FromErg = new Binary("client_log_event.tweet.from_erg") + val FromCroon = new Binary("client_log_event.tweet.from_croon") + val FromList = new Binary("client_log_event.tweet.from_list") + val FromRecTopic = new Binary("client_log_event.tweet.from_rec_topic") + val InjectedPosition = new Discrete("client_log_event.tweet.injectedPosition") + val TextOnly = new Binary("client_log_event.tweet.text_only") + val HasLikedBySocialContext = new Binary("client_log_event.tweet.has_liked_by_social_context") + val HasFollowedBySocialContext = new Binary( + "client_log_event.tweet.has_followed_by_social_context") + val HasTopicSocialContext = new 
Binary("client_log_event.tweet.has_topic_social_context") + val IsFollowedTopicTweet = new Binary("client_log_event.tweet.is_followed_topic_tweet") + val IsRecommendedTopicTweet = new Binary("client_log_event.tweet.is_recommended_topic_tweet") + val IsTweetAgeLessThan15Seconds = new Binary( + "client_log_event.tweet.tweet_age_less_than_15_seconds") + val IsTweetAgeLessThanOrEqualTo30Minutes = new Binary( + "client_log_event.tweet.tweet_age_lte_30_minutes") + val IsTweetAgeLessThanOrEqualTo1Hour = new Binary("client_log_event.tweet.tweet_age_lte_1_hour") + val IsTweetAgeLessThanOrEqualTo6Hours = new Binary("client_log_event.tweet.tweet_age_lte_6_hours") + val IsTweetAgeLessThanOrEqualTo12Hours = new Binary( + "client_log_event.tweet.tweet_age_lte_12_hours") + val IsTweetAgeGreaterThanOrEqualTo24Hours = new Binary( + "client_log_event.tweet.tweet_age_gte_24_hours") + val HasGreaterThanOrEqualTo100Favs = new Binary("client_log_event.tweet.has_gte_100_favs") + val HasGreaterThanOrEqualTo1KFavs = new Binary("client_log_event.tweet.has_gte_1k_favs") + val HasGreaterThanOrEqualTo10KFavs = new Binary("client_log_event.tweet.has_gte_10k_favs") + val HasGreaterThanOrEqualTo100KFavs = new Binary("client_log_event.tweet.has_gte_100k_favs") + val HasGreaterThanOrEqualTo10Retweets = new Binary("client_log_event.tweet.has_gte_10_retweets") + val HasGreaterThanOrEqualTo100Retweets = new Binary("client_log_event.tweet.has_gte_100_retweets") + val HasGreaterThanOrEqualTo1KRetweets = new Binary("client_log_event.tweet.has_gte_1k_retweets") + + /* Lookup from tweet-type string key to its binary feature */ val TweetTypeToFeatureMap: Map[String, Binary] = Map( + "link" -> HasVisibleLink, + "hashtag" -> HasHashtag, + "mutual_follow" -> FromMutualFollow, + "in_network" -> IsInNetwork, + "text_only" -> TextOnly, + "has_liked_by_social_context" -> HasLikedBySocialContext, + "has_followed_by_social_context" -> HasFollowedBySocialContext, + "has_topic_social_context" -> HasTopicSocialContext, + "is_followed_topic_tweet" -> IsFollowedTopicTweet, + 
"is_recommended_topic_tweet" -> IsRecommendedTopicTweet, + "tweet_age_less_than_15_seconds" -> IsTweetAgeLessThan15Seconds, + "tweet_age_lte_30_minutes" -> IsTweetAgeLessThanOrEqualTo30Minutes, + "tweet_age_lte_1_hour" -> IsTweetAgeLessThanOrEqualTo1Hour, + "tweet_age_lte_6_hours" -> IsTweetAgeLessThanOrEqualTo6Hours, + "tweet_age_lte_12_hours" -> IsTweetAgeLessThanOrEqualTo12Hours, + "tweet_age_gte_24_hours" -> IsTweetAgeGreaterThanOrEqualTo24Hours, + "has_gte_100_favs" -> HasGreaterThanOrEqualTo100Favs, + "has_gte_1k_favs" -> HasGreaterThanOrEqualTo1KFavs, + "has_gte_10k_favs" -> HasGreaterThanOrEqualTo10KFavs, + "has_gte_100k_favs" -> HasGreaterThanOrEqualTo100KFavs, + "has_gte_10_retweets" -> HasGreaterThanOrEqualTo10Retweets, + "has_gte_100_retweets" -> HasGreaterThanOrEqualTo100Retweets, + "has_gte_1k_retweets" -> HasGreaterThanOrEqualTo1KRetweets + ) + + /* Lookup from CandidateTweetSourceId thrift value to a binary source feature; CroonTopicTweet and CroonTweet both map to FromCroon */ val CandidateTweetSourceIdFeatureMap: Map[Int, Binary] = Map( + CandidateTweetSourceId.RecapTweet.value -> FromRecap, + CandidateTweetSourceId.RecycledTweet.value -> FromRecycled, + CandidateTweetSourceId.RecommendedTweet.value -> FromActivity, + CandidateTweetSourceId.Simcluster.value -> FromSimcluster, + CandidateTweetSourceId.ErgTweet.value -> FromErg, + CandidateTweetSourceId.CroonTopicTweet.value -> FromCroon, + CandidateTweetSourceId.CroonTweet.value -> FromCroon, + CandidateTweetSourceId.ListTweet.value -> FromList, + CandidateTweetSourceId.RecommendedTopicTweet.value -> FromRecTopic + ) + + val TweetFeaturesV2: Set[Feature[_]] = Set( + HasImage, + IsReply, + IsRetweet, + HasVisibleLink, + HasHashtag, + FromMutualFollow, + IsInNetwork + ) + + /* NOTE(review): HasVisibleLink is listed twice below; harmless in a Set, but confirm whether another feature was intended */ val ContentTweetTypeFeatures: Set[Feature[_]] = Set( + HasImage, + HasVisibleLink, + HasHashtag, + TextOnly, + HasVisibleLink + ) + + val FreshnessTweetTypeFeatures: Set[Feature[_]] = Set( + IsTweetAgeLessThan15Seconds, + IsTweetAgeLessThanOrEqualTo30Minutes, + IsTweetAgeLessThanOrEqualTo1Hour, + IsTweetAgeLessThanOrEqualTo6Hours, + 
IsTweetAgeLessThanOrEqualTo12Hours, + IsTweetAgeGreaterThanOrEqualTo24Hours + ) + + val SocialProofTweetTypeFeatures: Set[Feature[_]] = Set( + HasLikedBySocialContext, + HasFollowedBySocialContext, + HasTopicSocialContext + ) + + val TopicTweetPreferenceTweetTypeFeatures: Set[Feature[_]] = Set( + IsFollowedTopicTweet, + IsRecommendedTopicTweet + ) + + val TweetPopularityTweetTypeFeatures: Set[Feature[_]] = Set( + HasGreaterThanOrEqualTo100Favs, + HasGreaterThanOrEqualTo1KFavs, + HasGreaterThanOrEqualTo10KFavs, + HasGreaterThanOrEqualTo100KFavs, + HasGreaterThanOrEqualTo10Retweets, + HasGreaterThanOrEqualTo100Retweets, + HasGreaterThanOrEqualTo1KRetweets + ) + + val UserGraphInteractionTweetTypeFeatures: Set[Feature[_]] = Set( + IsInNetwork, + FromMutualFollow, + IsNotInNetwork, + IsPromoted + ) + + /* Union of the six grouped sets defined above */ val UserContentPreferenceTweetTypeFeatures: Set[Feature[_]] = + ContentTweetTypeFeatures ++ FreshnessTweetTypeFeatures ++ SocialProofTweetTypeFeatures ++ TopicTweetPreferenceTweetTypeFeatures ++ TweetPopularityTweetTypeFeatures ++ UserGraphInteractionTweetTypeFeatures + val AuthorContentPreferenceTweetTypeFeatures: Set[Feature[_]] = + Set(IsInNetwork, FromMutualFollow, IsNotInNetwork) ++ ContentTweetTypeFeatures +} diff --git a/src/scala/com/twitter/timelines/prediction/features/common/BUILD b/src/scala/com/twitter/timelines/prediction/features/common/BUILD new file mode 100644 index 0000000000..bfbe764c70 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/common/BUILD @@ -0,0 +1,11 @@ +scala_library( + sources = ["*.scala"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "src/java/com/twitter/ml/api:api-base", + "src/thrift/com/twitter/dal/personal_data:personal_data-java", + "src/thrift/com/twitter/ml/api:data-java", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) diff --git a/src/scala/com/twitter/timelines/prediction/features/common/CombinedFeatures.scala 
b/src/scala/com/twitter/timelines/prediction/features/common/CombinedFeatures.scala new file mode 100644 index 0000000000..d995fe2b02 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/common/CombinedFeatures.scala @@ -0,0 +1,536 @@ +package com.twitter.timelines.prediction.features.common + +import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ +import com.twitter.ml.api.Feature +import com.twitter.ml.api.FeatureType +import com.twitter.ml.api.Feature.Binary +import java.lang.{Boolean => JBoolean} +import scala.collection.JavaConverters._ + +object CombinedFeatures { + val IS_CLICKED = + new Binary("timelines.engagement.is_clicked", Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_DWELLED = + new Binary("timelines.engagement.is_dwelled", Set(TweetsViewed, EngagementsPrivate).asJava) + val IS_DWELLED_IN_BOUNDS_V1 = new Binary( + "timelines.engagement.is_dwelled_in_bounds_v1", + Set(TweetsViewed, EngagementsPrivate).asJava) + val IS_FAVORITED = new Binary( + "timelines.engagement.is_favorited", + Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava) + val IS_FOLLOWED = new Binary( + "timelines.engagement.is_followed", + Set(EngagementsPrivate, EngagementsPublic, Follow).asJava) + val IS_IMPRESSED = + new Binary("timelines.engagement.is_impressed", Set(TweetsViewed, EngagementsPrivate).asJava) + val IS_OPEN_LINKED = new Binary( + "timelines.engagement.is_open_linked", + Set(EngagementsPrivate, LinksClickedOn).asJava) + val IS_PHOTO_EXPANDED = new Binary( + "timelines.engagement.is_photo_expanded", + Set(MediaEngagementActivities, EngagementsPrivate).asJava) + val IS_PROFILE_CLICKED = new Binary( + "timelines.engagement.is_profile_clicked", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_QUOTED = new Binary( + "timelines.engagement.is_quoted", + Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) + val IS_REPLIED = new Binary( + 
"timelines.engagement.is_replied", + Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava) + val IS_RETWEETED = new Binary( + "timelines.engagement.is_retweeted", + Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) + val IS_RETWEETED_WITHOUT_QUOTE = new Binary( + "timelines.enagagement.is_retweeted_without_quote", + Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) + val IS_SHARE_DM_CLICKED = + new Binary("timelines.engagement.is_tweet_share_dm_clicked", Set(EngagementsPrivate).asJava) + val IS_SHARE_DM_SENT = + new Binary("timelines.engagement.is_tweet_share_dm_sent", Set(EngagementsPrivate).asJava) + val IS_VIDEO_PLAYBACK_25 = new Binary( + "timelines.engagement.is_video_playback_25", + Set(MediaEngagementActivities, EngagementsPrivate).asJava) + val IS_VIDEO_PLAYBACK_50 = new Binary( + "timelines.engagement.is_video_playback_50", + Set(MediaEngagementActivities, EngagementsPrivate).asJava) + val IS_VIDEO_PLAYBACK_75 = new Binary( + "timelines.engagement.is_video_playback_75", + Set(MediaEngagementActivities, EngagementsPrivate).asJava) + val IS_VIDEO_PLAYBACK_95 = new Binary( + "timelines.engagement.is_video_playback_95", + Set(MediaEngagementActivities, EngagementsPrivate).asJava) + val IS_VIDEO_PLAYBACK_COMPLETE = new Binary( + "timelines.engagement.is_video_playback_complete", + Set(MediaEngagementActivities, EngagementsPrivate).asJava) + val IS_VIDEO_PLAYBACK_START = new Binary( + "timelines.engagement.is_video_playback_start", + Set(MediaEngagementActivities, EngagementsPrivate).asJava) + val IS_VIDEO_VIEWED = new Binary( + "timelines.engagement.is_video_viewed", + Set(MediaEngagementActivities, EngagementsPrivate).asJava) + val IS_VIDEO_QUALITY_VIEWED = new Binary( + "timelines.engagement.is_video_quality_viewed", + Set(MediaEngagementActivities, EngagementsPrivate).asJava + ) + // v1: post click engagements: fav, reply + val IS_GOOD_CLICKED_CONVO_DESC_V1 = new 
Binary( + "timelines.engagement.is_good_clicked_convo_desc_favorited_or_replied", + Set( + TweetsClicked, + PublicLikes, + PrivateLikes, + PublicReplies, + PrivateReplies, + EngagementsPrivate, + EngagementsPublic).asJava) + // v2: post click engagements: click + val IS_GOOD_CLICKED_CONVO_DESC_V2 = new Binary( + "timelines.engagement.is_good_clicked_convo_desc_v2", + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_GOOD_CLICKED_WITH_DWELL_SUM_GTE_60S = new Binary( + "timelines.engagement.is_good_clicked_convo_desc_favorited_or_replied_or_dwell_sum_gte_60_secs", + Set( + TweetsClicked, + PublicLikes, + PrivateLikes, + PublicReplies, + PrivateReplies, + EngagementsPrivate, + EngagementsPublic).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_FAVORITED = new Binary( + "timelines.engagement.is_good_clicked_convo_desc_favorited", + Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_REPLIED = new Binary( + "timelines.engagement.is_good_clicked_convo_desc_replied", + Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_RETWEETED = new Binary( + "timelines.engagement.is_good_clicked_convo_desc_retweeted", + Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_CLICKED = new Binary( + "timelines.engagement.is_good_clicked_convo_desc_clicked", + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_FOLLOWED = new Binary( + "timelines.engagement.is_good_clicked_convo_desc_followed", + Set(EngagementsPrivate).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_SHARE_DM_CLICKED = new Binary( + "timelines.engagement.is_good_clicked_convo_desc_share_dm_clicked", + Set(EngagementsPrivate).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_PROFILE_CLICKED = new Binary( + "timelines.engagement.is_good_clicked_convo_desc_profile_clicked", + Set(EngagementsPrivate).asJava) + + val 
IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_0 = new Binary( + "timelines.engagement.is_good_clicked_convo_desc_uam_gt_0", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_1 = new Binary( + "timelines.engagement.is_good_clicked_convo_desc_uam_gt_1", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_2 = new Binary( + "timelines.engagement.is_good_clicked_convo_desc_uam_gt_2", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_3 = new Binary( + "timelines.engagement.is_good_clicked_convo_desc_uam_gt_3", + Set(EngagementsPrivate, EngagementsPublic).asJava) + + val IS_TWEET_DETAIL_DWELLED = new Binary( + "timelines.engagement.is_tweet_detail_dwelled", + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_TWEET_DETAIL_DWELLED_8_SEC = new Binary( + "timelines.engagement.is_tweet_detail_dwelled_8_sec", + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_TWEET_DETAIL_DWELLED_15_SEC = new Binary( + "timelines.engagement.is_tweet_detail_dwelled_15_sec", + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_TWEET_DETAIL_DWELLED_25_SEC = new Binary( + "timelines.engagement.is_tweet_detail_dwelled_25_sec", + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_TWEET_DETAIL_DWELLED_30_SEC = new Binary( + "timelines.engagement.is_tweet_detail_dwelled_30_sec", + Set(TweetsClicked, EngagementsPrivate).asJava) + + val IS_PROFILE_DWELLED = new Binary( + "timelines.engagement.is_profile_dwelled", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_DWELLED_10_SEC = new Binary( + "timelines.engagement.is_profile_dwelled_10_sec", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_DWELLED_20_SEC = new Binary( + "timelines.engagement.is_profile_dwelled_20_sec", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_DWELLED_30_SEC = new Binary( + 
"timelines.engagement.is_profile_dwelled_30_sec", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + + val IS_FULLSCREEN_VIDEO_DWELLED = new Binary( + "timelines.engagement.is_fullscreen_video_dwelled", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_FULLSCREEN_VIDEO_DWELLED_5_SEC = new Binary( + "timelines.engagement.is_fullscreen_video_dwelled_5_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_FULLSCREEN_VIDEO_DWELLED_10_SEC = new Binary( + "timelines.engagement.is_fullscreen_video_dwelled_10_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_FULLSCREEN_VIDEO_DWELLED_20_SEC = new Binary( + "timelines.engagement.is_fullscreen_video_dwelled_20_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_FULLSCREEN_VIDEO_DWELLED_30_SEC = new Binary( + "timelines.engagement.is_fullscreen_video_dwelled_30_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_LINK_DWELLED_15_SEC = new Binary( + "timelines.engagement.is_link_dwelled_15_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_LINK_DWELLED_30_SEC = new Binary( + "timelines.engagement.is_link_dwelled_30_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_LINK_DWELLED_60_SEC = new Binary( + "timelines.engagement.is_link_dwelled_60_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_HOME_LATEST_VISITED = + new Binary("timelines.engagement.is_home_latest_visited", Set(EngagementsPrivate).asJava) + + val IS_BOOKMARKED = + new Binary("timelines.engagement.is_bookmarked", Set(EngagementsPrivate).asJava) + val IS_SHARED = + new Binary("timelines.engagement.is_shared", Set(EngagementsPrivate).asJava) + val 
IS_SHARE_MENU_CLICKED = + new Binary("timelines.engagement.is_share_menu_clicked", Set(EngagementsPrivate).asJava) + + // Negative engagements + val IS_DONT_LIKE = new Binary("timelines.engagement.is_dont_like", Set(EngagementsPrivate).asJava) + val IS_BLOCK_CLICKED = new Binary( + "timelines.engagement.is_block_clicked", + Set(Blocks, TweetsClicked, EngagementsPrivate, EngagementsPublic).asJava) + val IS_BLOCK_DIALOG_BLOCKED = new Binary( + "timelines.engagement.is_block_dialog_blocked", + Set(Blocks, EngagementsPrivate, EngagementsPublic).asJava) + val IS_MUTE_CLICKED = new Binary( + "timelines.engagement.is_mute_clicked", + Set(Mutes, TweetsClicked, EngagementsPrivate).asJava) + val IS_MUTE_DIALOG_MUTED = + new Binary("timelines.engagement.is_mute_dialog_muted", Set(Mutes, EngagementsPrivate).asJava) + val IS_REPORT_TWEET_CLICKED = new Binary( + "timelines.engagement.is_report_tweet_clicked", + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_CARET_CLICKED = + new Binary("timelines.engagement.is_caret_clicked", Set(EngagementsPrivate).asJava) + val IS_NOT_ABOUT_TOPIC = + new Binary("timelines.engagement.is_not_about_topic", Set(EngagementsPrivate).asJava) + val IS_NOT_RECENT = + new Binary("timelines.engagement.is_not_recent", Set(EngagementsPrivate).asJava) + val IS_NOT_RELEVANT = + new Binary("timelines.engagement.is_not_relevant", Set(EngagementsPrivate).asJava) + val IS_SEE_FEWER = + new Binary("timelines.engagement.is_see_fewer", Set(EngagementsPrivate).asJava) + val IS_UNFOLLOW_TOPIC = + new Binary("timelines.engagement.is_unfollow_topic", Set(EngagementsPrivate).asJava) + val IS_FOLLOW_TOPIC = + new Binary("timelines.engagement.is_follow_topic", Set(EngagementsPrivate).asJava) + val IS_NOT_INTERESTED_IN_TOPIC = + new Binary("timelines.engagement.is_not_interested_in_topic", Set(EngagementsPrivate).asJava) + val IS_NEGATIVE_FEEDBACK = + new Binary("timelines.engagement.is_negative_feedback", Set(EngagementsPrivate).asJava) + val 
IS_IMPLICIT_POSITIVE_FEEDBACK_UNION = + new Binary( + "timelines.engagement.is_implicit_positive_feedback_union", + Set(EngagementsPrivate).asJava) + val IS_EXPLICIT_POSITIVE_FEEDBACK_UNION = + new Binary( + "timelines.engagement.is_explicit_positive_feedback_union", + Set(EngagementsPrivate).asJava) + val IS_ALL_NEGATIVE_FEEDBACK_UNION = + new Binary( + "timelines.engagement.is_all_negative_feedback_union", + Set(EngagementsPrivate).asJava) + // Reciprocal engagements for reply forward engagement + val IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR = new Binary( + "timelines.engagement.is_replied_reply_impressed_by_author", + Set(EngagementsPrivate).asJava) + val IS_REPLIED_REPLY_FAVORITED_BY_AUTHOR = new Binary( + "timelines.engagement.is_replied_reply_favorited_by_author", + Set(EngagementsPrivate, EngagementsPublic, PrivateLikes, PublicLikes).asJava) + val IS_REPLIED_REPLY_QUOTED_BY_AUTHOR = new Binary( + "timelines.engagement.is_replied_reply_quoted_by_author", + Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava) + val IS_REPLIED_REPLY_REPLIED_BY_AUTHOR = new Binary( + "timelines.engagement.is_replied_reply_replied_by_author", + Set(EngagementsPrivate, EngagementsPublic, PrivateReplies, PublicReplies).asJava) + val IS_REPLIED_REPLY_RETWEETED_BY_AUTHOR = new Binary( + "timelines.engagement.is_replied_reply_retweeted_by_author", + Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava) + val IS_REPLIED_REPLY_BLOCKED_BY_AUTHOR = new Binary( + "timelines.engagement.is_replied_reply_blocked_by_author", + Set(Blocks, EngagementsPrivate, EngagementsPublic).asJava) + val IS_REPLIED_REPLY_FOLLOWED_BY_AUTHOR = new Binary( + "timelines.engagement.is_replied_reply_followed_by_author", + Set(EngagementsPrivate, EngagementsPublic, Follow).asJava) + val IS_REPLIED_REPLY_UNFOLLOWED_BY_AUTHOR = new Binary( + "timelines.engagement.is_replied_reply_unfollowed_by_author", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val 
IS_REPLIED_REPLY_MUTED_BY_AUTHOR = new Binary( + "timelines.engagement.is_replied_reply_muted_by_author", + Set(Mutes, EngagementsPrivate).asJava) + val IS_REPLIED_REPLY_REPORTED_BY_AUTHOR = new Binary( + "timelines.engagement.is_replied_reply_reported_by_author", + Set(EngagementsPrivate).asJava) + + // Reciprocal engagements for fav forward engagement + val IS_FAVORITED_FAV_FAVORITED_BY_AUTHOR = new Binary( + "timelines.engagement.is_favorited_fav_favorited_by_author", + Set(EngagementsPrivate, EngagementsPublic, PrivateLikes, PublicLikes).asJava + ) + val IS_FAVORITED_FAV_REPLIED_BY_AUTHOR = new Binary( + "timelines.engagement.is_favorited_fav_replied_by_author", + Set(EngagementsPrivate, EngagementsPublic, PrivateReplies, PublicReplies).asJava + ) + val IS_FAVORITED_FAV_RETWEETED_BY_AUTHOR = new Binary( + "timelines.engagement.is_favorited_fav_retweeted_by_author", + Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava + ) + val IS_FAVORITED_FAV_FOLLOWED_BY_AUTHOR = new Binary( + "timelines.engagement.is_favorited_fav_followed_by_author", + Set(EngagementsPrivate, EngagementsPublic).asJava + ) + + // define good profile click by considering following engagements (follow, fav, reply, retweet, etc.) 
at profile page + val IS_PROFILE_CLICKED_AND_PROFILE_FOLLOW = new Binary( + "timelines.engagement.is_profile_clicked_and_profile_follow", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, Follow).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_FAV = new Binary( + "timelines.engagement.is_profile_clicked_and_profile_fav", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateLikes, PublicLikes).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_REPLY = new Binary( + "timelines.engagement.is_profile_clicked_and_profile_reply", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateReplies, PublicReplies).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_RETWEET = new Binary( + "timelines.engagement.is_profile_clicked_and_profile_retweet", + Set( + ProfilesViewed, + ProfilesClicked, + EngagementsPrivate, + PrivateRetweets, + PublicRetweets).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_CLICK = new Binary( + "timelines.engagement.is_profile_clicked_and_profile_tweet_click", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, TweetsClicked).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_SHARE_DM_CLICK = new Binary( + "timelines.engagement.is_profile_clicked_and_profile_share_dm_click", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + // This derived label is the union of all binary features above + val IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED = new Binary( + "timelines.engagement.is_profile_clicked_and_profile_engaged", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, EngagementsPublic).asJava) + + // define bad profile click by considering following engagements (user report, tweet report, mute, block, etc) at profile page + val IS_PROFILE_CLICKED_AND_PROFILE_USER_REPORT_CLICK = new Binary( + "timelines.engagement.is_profile_clicked_and_profile_user_report_click", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_REPORT_CLICK = new Binary( + 
"timelines.engagement.is_profile_clicked_and_profile_tweet_report_click", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_MUTE = new Binary( + "timelines.engagement.is_profile_clicked_and_profile_mute", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_BLOCK = new Binary( + "timelines.engagement.is_profile_clicked_and_profile_block", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + // This derived label is the union of bad profile click engagements and existing negative feedback + val IS_NEGATIVE_FEEDBACK_V2 = new Binary( + "timelines.engagement.is_negative_feedback_v2", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_NEGATIVE_FEEDBACK_UNION = new Binary( + "timelines.engagement.is_negative_feedback_union", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + // don't like, mute or profile page -> mute + val IS_WEAK_NEGATIVE_FEEDBACK = new Binary( + "timelines.engagement.is_weak_negative_feedback", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + // report, block or profile page -> report, block + val IS_STRONG_NEGATIVE_FEEDBACK = new Binary( + "timelines.engagement.is_strong_negative_feedback", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + // engagement for following user from any surface area + val IS_FOLLOWED_FROM_ANY_SURFACE_AREA = new Binary( + "timelines.engagement.is_followed_from_any_surface_area", + Set(EngagementsPublic, EngagementsPrivate).asJava) + val IS_RELEVANCE_PROMPT_YES_CLICKED = new Binary( + "timelines.engagement.is_relevance_prompt_yes_clicked", + Set(EngagementsPublic, EngagementsPrivate).asJava) + + // Reply downvote engagements + val IS_REPLY_DOWNVOTED = + new Binary("timelines.engagement.is_reply_downvoted", Set(EngagementsPrivate).asJava) + val IS_REPLY_DOWNVOTE_REMOVED = + new 
Binary("timelines.engagement.is_reply_downvote_removed", Set(EngagementsPrivate).asJava) + + /** + * Contains all engagements that are used/consumed by real-time + * aggregates summingbird jobs. These engagements need to be + * extractable from [[ClientEvent]]. + */ + val EngagementsRealTime: Set[Feature[JBoolean]] = Set( + IS_CLICKED, + IS_DWELLED, + IS_FAVORITED, + IS_FOLLOWED, + IS_OPEN_LINKED, + IS_PHOTO_EXPANDED, + IS_PROFILE_CLICKED, + IS_QUOTED, + IS_REPLIED, + IS_RETWEETED, + IS_RETWEETED_WITHOUT_QUOTE, + IS_SHARE_DM_CLICKED, + IS_SHARE_DM_SENT, + IS_VIDEO_PLAYBACK_50, + IS_VIDEO_VIEWED, + IS_VIDEO_QUALITY_VIEWED + ) + + val NegativeEngagementsRealTime: Set[Feature[JBoolean]] = Set( + IS_REPORT_TWEET_CLICKED, + IS_BLOCK_CLICKED, + IS_MUTE_CLICKED + ) + + val NegativeEngagementsRealTimeDontLike: Set[Feature[JBoolean]] = Set( + IS_DONT_LIKE + ) + + val NegativeEngagementsSecondary: Set[Feature[JBoolean]] = Set( + IS_NOT_INTERESTED_IN_TOPIC, + IS_NOT_ABOUT_TOPIC, + IS_NOT_RECENT, + IS_NOT_RELEVANT, + IS_SEE_FEWER, + IS_UNFOLLOW_TOPIC + ) + + val PrivateEngagements: Set[Feature[JBoolean]] = Set( + IS_CLICKED, + IS_DWELLED, + IS_OPEN_LINKED, + IS_PHOTO_EXPANDED, + IS_PROFILE_CLICKED, + IS_QUOTED, + IS_VIDEO_PLAYBACK_50, + IS_VIDEO_QUALITY_VIEWED + ) + + val ImpressedEngagements: Set[Feature[JBoolean]] = Set( + IS_IMPRESSED + ) + + val PrivateEngagementsV2: Set[Feature[JBoolean]] = Set( + IS_CLICKED, + IS_OPEN_LINKED, + IS_PHOTO_EXPANDED, + IS_PROFILE_CLICKED, + IS_VIDEO_PLAYBACK_50, + IS_VIDEO_QUALITY_VIEWED + ) ++ ImpressedEngagements + + val CoreEngagements: Set[Feature[JBoolean]] = Set( + IS_FAVORITED, + IS_REPLIED, + IS_RETWEETED + ) + + val DwellEngagements: Set[Feature[JBoolean]] = Set( + IS_DWELLED + ) + + val PrivateCoreEngagements: Set[Feature[JBoolean]] = Set( + IS_CLICKED, + IS_OPEN_LINKED, + IS_PHOTO_EXPANDED, + IS_VIDEO_PLAYBACK_50, + IS_VIDEO_QUALITY_VIEWED + ) + + val ConditionalEngagements: Set[Feature[JBoolean]] = Set( + 
IS_GOOD_CLICKED_CONVO_DESC_V1, + IS_GOOD_CLICKED_CONVO_DESC_V2, + IS_GOOD_CLICKED_WITH_DWELL_SUM_GTE_60S + ) + + val ShareEngagements: Set[Feature[JBoolean]] = Set( + IS_SHARED, + IS_SHARE_MENU_CLICKED + ) + + val BookmarkEngagements: Set[Feature[JBoolean]] = Set( + IS_BOOKMARKED + ) + + val TweetDetailDwellEngagements: Set[Feature[JBoolean]] = Set( + IS_TWEET_DETAIL_DWELLED, + IS_TWEET_DETAIL_DWELLED_8_SEC, + IS_TWEET_DETAIL_DWELLED_15_SEC, + IS_TWEET_DETAIL_DWELLED_25_SEC, + IS_TWEET_DETAIL_DWELLED_30_SEC + ) + + val ProfileDwellEngagements: Set[Feature[JBoolean]] = Set( + IS_PROFILE_DWELLED, + IS_PROFILE_DWELLED_10_SEC, + IS_PROFILE_DWELLED_20_SEC, + IS_PROFILE_DWELLED_30_SEC + ) + + val FullscreenVideoDwellEngagements: Set[Feature[JBoolean]] = Set( + IS_FULLSCREEN_VIDEO_DWELLED, + IS_FULLSCREEN_VIDEO_DWELLED_5_SEC, + IS_FULLSCREEN_VIDEO_DWELLED_10_SEC, + IS_FULLSCREEN_VIDEO_DWELLED_20_SEC, + IS_FULLSCREEN_VIDEO_DWELLED_30_SEC + ) + + // Please do not add new engagements here until having estimated the impact + // to capacity requirements. User-author real-time aggregates have a very + // large key space. 
+ val UserAuthorEngagements: Set[Feature[JBoolean]] = CoreEngagements ++ DwellEngagements ++ Set( + IS_CLICKED, + IS_PROFILE_CLICKED, + IS_PHOTO_EXPANDED, + IS_VIDEO_PLAYBACK_50, + IS_NEGATIVE_FEEDBACK_UNION + ) + + val ImplicitPositiveEngagements: Set[Feature[JBoolean]] = Set( + IS_CLICKED, + IS_DWELLED, + IS_OPEN_LINKED, + IS_PROFILE_CLICKED, + IS_QUOTED, + IS_VIDEO_PLAYBACK_50, + IS_VIDEO_QUALITY_VIEWED, + IS_TWEET_DETAIL_DWELLED, + IS_GOOD_CLICKED_CONVO_DESC_V1, + IS_GOOD_CLICKED_CONVO_DESC_V2, + IS_SHARED, + IS_SHARE_MENU_CLICKED, + IS_SHARE_DM_SENT, + IS_SHARE_DM_CLICKED + ) + + val ExplicitPositiveEngagements: Set[Feature[JBoolean]] = CoreEngagements ++ Set( + IS_FOLLOWED, + IS_QUOTED + ) + + val AllNegativeEngagements: Set[Feature[JBoolean]] = + NegativeEngagementsRealTime ++ NegativeEngagementsRealTimeDontLike ++ Set( + IS_NOT_RECENT, + IS_NOT_RELEVANT, + IS_SEE_FEWER + ) +} diff --git a/src/scala/com/twitter/timelines/prediction/features/common/NonHomeLabelFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/common/NonHomeLabelFeatures.scala new file mode 100644 index 0000000000..369b48b39a --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/common/NonHomeLabelFeatures.scala @@ -0,0 +1,97 @@ +package com.twitter.timelines.prediction.features.common + +import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ +import com.twitter.ml.api.Feature +import com.twitter.ml.api.Feature.Binary +import java.lang.{Boolean => JBoolean} +import scala.collection.JavaConverters._ + +/** Binary label features whose names all start with "profile.engagement.". CoreEngagements and NegativeEngagements group the labels; IS_NEGATIVE_FEEDBACK_UNION is declared but belongs to neither set. */ object ProfileLabelFeatures { + private val prefix = "profile" + + val IS_CLICKED = + new Binary(s"${prefix}.engagement.is_clicked", Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_DWELLED = + new Binary(s"${prefix}.engagement.is_dwelled", Set(TweetsViewed, EngagementsPrivate).asJava) + val IS_FAVORITED = new Binary( + s"${prefix}.engagement.is_favorited", + Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava) + val
IS_REPLIED = new Binary( + s"${prefix}.engagement.is_replied", + Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava) + val IS_RETWEETED = new Binary( + s"${prefix}.engagement.is_retweeted", + Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) + + // Negative engagements + val IS_DONT_LIKE = + new Binary(s"${prefix}.engagement.is_dont_like", Set(EngagementsPrivate).asJava) + val IS_BLOCK_CLICKED = new Binary( + s"${prefix}.engagement.is_block_clicked", + Set(Blocks, TweetsClicked, EngagementsPrivate, EngagementsPublic).asJava) + val IS_MUTE_CLICKED = new Binary( + s"${prefix}.engagement.is_mute_clicked", + Set(Mutes, TweetsClicked, EngagementsPrivate).asJava) + val IS_REPORT_TWEET_CLICKED = new Binary( + s"${prefix}.engagement.is_report_tweet_clicked", + Set(TweetsClicked, EngagementsPrivate).asJava) + + val IS_NEGATIVE_FEEDBACK_UNION = new Binary( + s"${prefix}.engagement.is_negative_feedback_union", + Set(EngagementsPrivate, Blocks, Mutes, TweetsClicked, EngagementsPublic).asJava) + + val CoreEngagements: Set[Feature[JBoolean]] = Set( + IS_CLICKED, + IS_DWELLED, + IS_FAVORITED, + IS_REPLIED, + IS_RETWEETED + ) + + val NegativeEngagements: Set[Feature[JBoolean]] = Set( + IS_DONT_LIKE, + IS_BLOCK_CLICKED, + IS_MUTE_CLICKED, + IS_REPORT_TWEET_CLICKED + ) + +} + +/** Binary label features whose names all start with "search.engagement.". CoreEngagements holds all eight labels declared here, including the three profile-click variants. */ object SearchLabelFeatures { + private val prefix = "search" + + val IS_CLICKED = + new Binary(s"${prefix}.engagement.is_clicked", Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_DWELLED = + new Binary(s"${prefix}.engagement.is_dwelled", Set(TweetsViewed, EngagementsPrivate).asJava) + val IS_FAVORITED = new Binary( + s"${prefix}.engagement.is_favorited", + Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava) + val IS_REPLIED = new Binary( + s"${prefix}.engagement.is_replied", + Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava) + val IS_RETWEETED = new Binary( +
s"${prefix}.engagement.is_retweeted", + Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) + val IS_PROFILE_CLICKED_SEARCH_RESULT_USER = new Binary( + s"${prefix}.engagement.is_profile_clicked_search_result_user", + Set(ProfilesClicked, ProfilesViewed, EngagementsPrivate).asJava) + val IS_PROFILE_CLICKED_SEARCH_RESULT_TWEET = new Binary( + s"${prefix}.engagement.is_profile_clicked_search_result_tweet", + Set(ProfilesClicked, ProfilesViewed, EngagementsPrivate).asJava) + val IS_PROFILE_CLICKED_TYPEAHEAD_USER = new Binary( + s"${prefix}.engagement.is_profile_clicked_typeahead_user", + Set(ProfilesClicked, ProfilesViewed, EngagementsPrivate).asJava) + + val CoreEngagements: Set[Feature[JBoolean]] = Set( + IS_CLICKED, + IS_DWELLED, + IS_FAVORITED, + IS_REPLIED, + IS_RETWEETED, + IS_PROFILE_CLICKED_SEARCH_RESULT_USER, + IS_PROFILE_CLICKED_SEARCH_RESULT_TWEET, + IS_PROFILE_CLICKED_TYPEAHEAD_USER + ) +} +// Add Tweet Detail labels later diff --git a/src/scala/com/twitter/timelines/prediction/features/common/TimelinesSharedFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/common/TimelinesSharedFeatures.scala new file mode 100644 index 0000000000..99698530f6 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/common/TimelinesSharedFeatures.scala @@ -0,0 +1,759 @@ +package com.twitter.timelines.prediction.features.common + +import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ +import com.twitter.ml.api.Feature.Binary +import com.twitter.ml.api.Feature.Continuous +import com.twitter.ml.api.Feature.Discrete +import com.twitter.ml.api.Feature.SparseBinary +import com.twitter.ml.api.Feature.SparseContinuous +import com.twitter.ml.api.Feature.Text +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup +import scala.collection.JavaConverters._ + +object TimelinesSharedFeatures extends TimelinesSharedFeatures("") +object
InReplyToTweetTimelinesSharedFeatures extends TimelinesSharedFeatures("in_reply_to_tweet") + +/** + * Defines shared features. name(...) prepends the prefix and a dot to a feature name when prefix is non-empty and returns the name unchanged otherwise. + */ +class TimelinesSharedFeatures(prefix: String) { + private def name(featureName: String): String = { + if (prefix.nonEmpty) { + s"$prefix.$featureName" + } else { + featureName + } + } + + // meta + val EXPERIMENT_META = new SparseBinary( + name("timelines.meta.experiment_meta"), + Set(ExperimentId, ExperimentName).asJava) + + // historically used in the "combined models" to distinguish in-network and out of network tweets. + // now the feature denotes which adapter (recap or rectweet) was used to generate the datarecords. + // and is used by the data collection pipeline to split the training data. + val INJECTION_TYPE = new Discrete(name("timelines.meta.injection_type")) + + // Indicates which injection module this is + val INJECTION_MODULE_NAME = new Text(name("timelines.meta.injection_module_name")) + + val LIST_ID = new Discrete(name("timelines.meta.list_id")) + val LIST_IS_PINNED = new Binary(name("timelines.meta.list_is_pinned")) + + // internal id per each PS request. mainly to join back common features and candidate features later + val PREDICTION_REQUEST_ID = new Discrete(name("timelines.meta.prediction_request_id")) + // internal id per each TLM request.
mainly to deduplicate re-served cached tweets in logging + val SERVED_REQUEST_ID = new Discrete(name("timelines.meta.served_request_id")) + // internal id used for join key in kafka logging, equal to servedRequestId if tweet is cached, + // else equal to predictionRequestId + val SERVED_ID = new Discrete(name("timelines.meta.served_id")) + val REQUEST_JOIN_ID = new Discrete(name("timelines.meta.request_join_id")) + + // Internal boolean flag per tweet, whether the tweet is served from RankedTweetsCache: TQ-14050 + // this feature should not be trained on, blacklisted in feature_config: D838346 + val IS_READ_FROM_CACHE = new Binary(name("timelines.meta.is_read_from_cache")) + + // model score discounts + val PHOTO_DISCOUNT = new Continuous(name("timelines.score_discounts.photo")) + val VIDEO_DISCOUNT = new Continuous(name("timelines.score_discounts.video")) + val TWEET_HEIGHT_DISCOUNT = new Continuous(name("timelines.score_discounts.tweet_height")) + val TOXICITY_DISCOUNT = new Continuous(name("timelines.score_discounts.toxicity")) + + // engagements + val ENGAGEMENT_TYPE = new Discrete(name("timelines.engagement.type")) + val PREDICTED_IS_FAVORITED = + new Continuous(name("timelines.engagement_predicted.is_favorited"), Set(EngagementScore).asJava) + val PREDICTED_IS_RETWEETED = + new Continuous(name("timelines.engagement_predicted.is_retweeted"), Set(EngagementScore).asJava) + val PREDICTED_IS_QUOTED = + new Continuous(name("timelines.engagement_predicted.is_quoted"), Set(EngagementScore).asJava) + val PREDICTED_IS_REPLIED = + new Continuous(name("timelines.engagement_predicted.is_replied"), Set(EngagementScore).asJava) + val PREDICTED_IS_OPEN_LINKED = new Continuous( + name("timelines.engagement_predicted.is_open_linked"), + Set(EngagementScore).asJava) + val PREDICTED_IS_GOOD_OPEN_LINK = new Continuous( + name("timelines.engagement_predicted.is_good_open_link"), + Set(EngagementScore).asJava) + val PREDICTED_IS_PROFILE_CLICKED = new Continuous( + 
name("timelines.engagement_predicted.is_profile_clicked"), + Set(EngagementScore).asJava + ) + val PREDICTED_IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED = new Continuous( + name("timelines.engagement_predicted.is_profile_clicked_and_profile_engaged"), + Set(EngagementScore).asJava + ) + val PREDICTED_IS_CLICKED = + new Continuous(name("timelines.engagement_predicted.is_clicked"), Set(EngagementScore).asJava) + val PREDICTED_IS_PHOTO_EXPANDED = new Continuous( + name("timelines.engagement_predicted.is_photo_expanded"), + Set(EngagementScore).asJava + ) + val PREDICTED_IS_FOLLOWED = + new Continuous(name("timelines.engagement_predicted.is_followed"), Set(EngagementScore).asJava) + val PREDICTED_IS_DONT_LIKE = + new Continuous(name("timelines.engagement_predicted.is_dont_like"), Set(EngagementScore).asJava) + val PREDICTED_IS_VIDEO_PLAYBACK_50 = new Continuous( + name("timelines.engagement_predicted.is_video_playback_50"), + Set(EngagementScore).asJava + ) + val PREDICTED_IS_VIDEO_QUALITY_VIEWED = new Continuous( + name("timelines.engagement_predicted.is_video_quality_viewed"), + Set(EngagementScore).asJava + ) + val PREDICTED_IS_GOOD_CLICKED_V1 = new Continuous( + name("timelines.engagement_predicted.is_good_clicked_convo_desc_favorited_or_replied"), + Set(EngagementScore).asJava) + val PREDICTED_IS_GOOD_CLICKED_V2 = new Continuous( + name("timelines.engagement_predicted.is_good_clicked_convo_desc_v2"), + Set(EngagementScore).asJava) + val PREDICTED_IS_TWEET_DETAIL_DWELLED_8_SEC = new Continuous( + name("timelines.engagement_predicted.is_tweet_detail_dwelled_8_sec"), + Set(EngagementScore).asJava) + val PREDICTED_IS_TWEET_DETAIL_DWELLED_15_SEC = new Continuous( + name("timelines.engagement_predicted.is_tweet_detail_dwelled_15_sec"), + Set(EngagementScore).asJava) + val PREDICTED_IS_TWEET_DETAIL_DWELLED_25_SEC = new Continuous( + name("timelines.engagement_predicted.is_tweet_detail_dwelled_25_sec"), + Set(EngagementScore).asJava) + val 
PREDICTED_IS_TWEET_DETAIL_DWELLED_30_SEC = new Continuous( + name("timelines.engagement_predicted.is_tweet_detail_dwelled_30_sec"), + Set(EngagementScore).asJava) + val PREDICTED_IS_GOOD_CLICKED_WITH_DWELL_SUM_GTE_60S = new Continuous( + name( + "timelines.engagement_predicted.is_good_clicked_convo_desc_favorited_or_replied_or_dwell_sum_gte_60_secs"), + Set(EngagementScore).asJava) + val PREDICTED_IS_FAVORITED_FAV_ENGAGED_BY_AUTHOR = new Continuous( + name("timelines.engagement_predicted.is_favorited_fav_engaged_by_author"), + Set(EngagementScore).asJava) + + val PREDICTED_IS_REPORT_TWEET_CLICKED = + new Continuous( + name("timelines.engagement_predicted.is_report_tweet_clicked"), + Set(EngagementScore).asJava) + val PREDICTED_IS_NEGATIVE_FEEDBACK = new Continuous( + name("timelines.engagement_predicted.is_negative_feedback"), + Set(EngagementScore).asJava) + val PREDICTED_IS_NEGATIVE_FEEDBACK_V2 = new Continuous( + name("timelines.engagement_predicted.is_negative_feedback_v2"), + Set(EngagementScore).asJava) + val PREDICTED_IS_WEAK_NEGATIVE_FEEDBACK = new Continuous( + name("timelines.engagement_predicted.is_weak_negative_feedback"), + Set(EngagementScore).asJava) + val PREDICTED_IS_STRONG_NEGATIVE_FEEDBACK = new Continuous( + name("timelines.engagement_predicted.is_strong_negative_feedback"), + Set(EngagementScore).asJava) + + val PREDICTED_IS_DWELLED_IN_BOUNDS_V1 = new Continuous( + name("timelines.engagement_predicted.is_dwelled_in_bounds_v1"), + Set(EngagementScore).asJava) + val PREDICTED_DWELL_NORMALIZED_OVERALL = new Continuous( + name("timelines.engagement_predicted.dwell_normalized_overall"), + Set(EngagementScore).asJava) + val PREDICTED_DWELL_CDF = + new Continuous(name("timelines.engagement_predicted.dwell_cdf"), Set(EngagementScore).asJava) + val PREDICTED_DWELL_CDF_OVERALL = new Continuous( + name("timelines.engagement_predicted.dwell_cdf_overall"), + Set(EngagementScore).asJava) + val PREDICTED_IS_DWELLED = + new 
Continuous(name("timelines.engagement_predicted.is_dwelled"), Set(EngagementScore).asJava) + + val PREDICTED_IS_HOME_LATEST_VISITED = new Continuous( + name("timelines.engagement_predicted.is_home_latest_visited"), + Set(EngagementScore).asJava) + + val PREDICTED_IS_BOOKMARKED = new Continuous( + name("timelines.engagement_predicted.is_bookmarked"), + Set(EngagementScore).asJava) + + val PREDICTED_IS_SHARED = + new Continuous(name("timelines.engagement_predicted.is_shared"), Set(EngagementScore).asJava) + val PREDICTED_IS_SHARE_MENU_CLICKED = new Continuous( + name("timelines.engagement_predicted.is_share_menu_clicked"), + Set(EngagementScore).asJava) + + val PREDICTED_IS_PROFILE_DWELLED_20_SEC = new Continuous( + name("timelines.engagement_predicted.is_profile_dwelled_20_sec"), + Set(EngagementScore).asJava) + + val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_5_SEC = new Continuous( + name("timelines.engagement_predicted.is_fullscreen_video_dwelled_5_sec"), + Set(EngagementScore).asJava) + val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_10_SEC = new Continuous( + name("timelines.engagement_predicted.is_fullscreen_video_dwelled_10_sec"), + Set(EngagementScore).asJava) + val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_20_SEC = new Continuous( + name("timelines.engagement_predicted.is_fullscreen_video_dwelled_20_sec"), + Set(EngagementScore).asJava) + val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_30_SEC = new Continuous( + name("timelines.engagement_predicted.is_fullscreen_video_dwelled_30_sec"), + Set(EngagementScore).asJava) + + // Please use this timestamp, not the `meta.timestamp`, for the actual served timestamp. + val SERVED_TIMESTAMP = + new Discrete("timelines.meta.timestamp.served", Set(PrivateTimestamp).asJava) + + // timestamp when the engagement has occurred. 
do not train on these features + val TIMESTAMP_FAVORITED = + new Discrete("timelines.meta.timestamp.engagement.favorited", Set(PublicTimestamp).asJava) + val TIMESTAMP_RETWEETED = + new Discrete("timelines.meta.timestamp.engagement.retweeted", Set(PublicTimestamp).asJava) + val TIMESTAMP_REPLIED = + new Discrete("timelines.meta.timestamp.engagement.replied", Set(PublicTimestamp).asJava) + val TIMESTAMP_PROFILE_CLICKED = new Discrete( + "timelines.meta.timestamp.engagement.profile_clicked", + Set(PrivateTimestamp).asJava) + val TIMESTAMP_CLICKED = + new Discrete("timelines.meta.timestamp.engagement.clicked", Set(PrivateTimestamp).asJava) + val TIMESTAMP_PHOTO_EXPANDED = + new Discrete("timelines.meta.timestamp.engagement.photo_expanded", Set(PrivateTimestamp).asJava) + val TIMESTAMP_DWELLED = + new Discrete("timelines.meta.timestamp.engagement.dwelled", Set(PrivateTimestamp).asJava) + val TIMESTAMP_VIDEO_PLAYBACK_50 = new Discrete( + "timelines.meta.timestamp.engagement.video_playback_50", + Set(PrivateTimestamp).asJava) + // reply engaged by author + val TIMESTAMP_REPLY_FAVORITED_BY_AUTHOR = new Discrete( + "timelines.meta.timestamp.engagement.reply_favorited_by_author", + Set(PublicTimestamp).asJava) + val TIMESTAMP_REPLY_REPLIED_BY_AUTHOR = new Discrete( + "timelines.meta.timestamp.engagement.reply_replied_by_author", + Set(PublicTimestamp).asJava) + val TIMESTAMP_REPLY_RETWEETED_BY_AUTHOR = new Discrete( + "timelines.meta.timestamp.engagement.reply_retweeted_by_author", + Set(PublicTimestamp).asJava) + // fav engaged by author + val TIMESTAMP_FAV_FAVORITED_BY_AUTHOR = new Discrete( + "timelines.meta.timestamp.engagement.fav_favorited_by_author", + Set(PublicTimestamp).asJava) + val TIMESTAMP_FAV_REPLIED_BY_AUTHOR = new Discrete( + "timelines.meta.timestamp.engagement.fav_replied_by_author", + Set(PublicTimestamp).asJava) + val TIMESTAMP_FAV_RETWEETED_BY_AUTHOR = new Discrete( + "timelines.meta.timestamp.engagement.fav_retweeted_by_author", + 
Set(PublicTimestamp).asJava) + val TIMESTAMP_FAV_FOLLOWED_BY_AUTHOR = new Discrete( + "timelines.meta.timestamp.engagement.fav_followed_by_author", + Set(PublicTimestamp).asJava) + // good click + val TIMESTAMP_GOOD_CLICK_CONVO_DESC_FAVORITED = new Discrete( + "timelines.meta.timestamp.engagement.good_click_convo_desc_favorited", + Set(PrivateTimestamp).asJava) + val TIMESTAMP_GOOD_CLICK_CONVO_DESC_REPLIIED = new Discrete( + "timelines.meta.timestamp.engagement.good_click_convo_desc_replied", + Set(PrivateTimestamp).asJava) + val TIMESTAMP_GOOD_CLICK_CONVO_DESC_PROFILE_CLICKED = new Discrete( + "timelines.meta.timestamp.engagement.good_click_convo_desc_profiile_clicked", + Set(PrivateTimestamp).asJava) + val TIMESTAMP_NEGATIVE_FEEDBACK = new Discrete( + "timelines.meta.timestamp.engagement.negative_feedback", + Set(PrivateTimestamp).asJava) + val TIMESTAMP_REPORT_TWEET_CLICK = + new Discrete( + "timelines.meta.timestamp.engagement.report_tweet_click", + Set(PrivateTimestamp).asJava) + val TIMESTAMP_IMPRESSED = + new Discrete("timelines.meta.timestamp.engagement.impressed", Set(PublicTimestamp).asJava) + val TIMESTAMP_TWEET_DETAIL_DWELLED = + new Discrete( + "timelines.meta.timestamp.engagement.tweet_detail_dwelled", + Set(PublicTimestamp).asJava) + val TIMESTAMP_PROFILE_DWELLED = + new Discrete("timelines.meta.timestamp.engagement.profile_dwelled", Set(PublicTimestamp).asJava) + val TIMESTAMP_FULLSCREEN_VIDEO_DWELLED = + new Discrete( + "timelines.meta.timestamp.engagement.fullscreen_video_dwelled", + Set(PublicTimestamp).asJava) + val TIMESTAMP_LINK_DWELLED = + new Discrete("timelines.meta.timestamp.engagement.link_dwelled", Set(PublicTimestamp).asJava) + + // these are used to dup and split the negative instances during streaming processing (kafka) + val TRAINING_FOR_FAVORITED = + new Binary("timelines.meta.training_data.for_favorited", Set(EngagementId).asJava) + val TRAINING_FOR_RETWEETED = + new Binary("timelines.meta.training_data.for_retweeted", 
Set(EngagementId).asJava) + val TRAINING_FOR_REPLIED = + new Binary("timelines.meta.training_data.for_replied", Set(EngagementId).asJava) + val TRAINING_FOR_PROFILE_CLICKED = + new Binary("timelines.meta.training_data.for_profile_clicked", Set(EngagementId).asJava) + val TRAINING_FOR_CLICKED = + new Binary("timelines.meta.training_data.for_clicked", Set(EngagementId).asJava) + val TRAINING_FOR_PHOTO_EXPANDED = + new Binary("timelines.meta.training_data.for_photo_expanded", Set(EngagementId).asJava) + val TRAINING_FOR_VIDEO_PLAYBACK_50 = + new Binary("timelines.meta.training_data.for_video_playback_50", Set(EngagementId).asJava) + val TRAINING_FOR_NEGATIVE_FEEDBACK = + new Binary("timelines.meta.training_data.for_negative_feedback", Set(EngagementId).asJava) + val TRAINING_FOR_REPORTED = + new Binary("timelines.meta.training_data.for_reported", Set(EngagementId).asJava) + val TRAINING_FOR_DWELLED = + new Binary("timelines.meta.training_data.for_dwelled", Set(EngagementId).asJava) + val TRAINING_FOR_SHARED = + new Binary("timelines.meta.training_data.for_shared", Set(EngagementId).asJava) + val TRAINING_FOR_SHARE_MENU_CLICKED = + new Binary("timelines.meta.training_data.for_share_menu_clicked", Set(EngagementId).asJava) + + // Warning: do not train on these features + val PREDICTED_SCORE = new Continuous(name("timelines.score"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_FAV = new Continuous(name("timelines.score.fav"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_RETWEET = + new Continuous(name("timelines.score.retweet"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_REPLY = + new Continuous(name("timelines.score.reply"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_OPEN_LINK = + new Continuous(name("timelines.score.open_link"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_GOOD_OPEN_LINK = + new Continuous(name("timelines.score.good_open_link"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_PROFILE_CLICK = + new 
Continuous(name("timelines.score.profile_click"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_DETAIL_EXPAND = + new Continuous(name("timelines.score.detail_expand"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_PHOTO_EXPAND = + new Continuous(name("timelines.score.photo_expand"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_PLAYBACK_50 = + new Continuous(name("timelines.score.playback_50"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_VIDEO_QUALITY_VIEW = + new Continuous(name("timelines.score.video_quality_view"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_DONT_LIKE = + new Continuous(name("timelines.score.dont_like"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_PROFILE_CLICKED_AND_PROFILE_ENGAGED = + new Continuous( + name("timelines.score.profile_clicked_and_profile_engaged"), + Set(EngagementScore).asJava) + val PREDICTED_SCORE_GOOD_CLICKED_V1 = + new Continuous(name("timelines.score.good_clicked_v1"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_GOOD_CLICKED_V2 = + new Continuous(name("timelines.score.good_clicked_v2"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_DWELL = + new Continuous(name("timelines.score.dwell"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_DWELL_CDF = + new Continuous(name("timelines.score.dwell_cfd"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_DWELL_CDF_OVERALL = + new Continuous(name("timelines.score.dwell_cfd_overall"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_DWELL_NORMALIZED_OVERALL = + new Continuous(name("timelines.score.dwell_normalized_overall"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_NEGATIVE_FEEDBACK = + new Continuous(name("timelines.score.negative_feedback"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_NEGATIVE_FEEDBACK_V2 = + new Continuous(name("timelines.score.negative_feedback_v2"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_WEAK_NEGATIVE_FEEDBACK = + new Continuous(name("timelines.score.weak_negative_feedback"), 
Set(EngagementScore).asJava) + val PREDICTED_SCORE_STRONG_NEGATIVE_FEEDBACK = + new Continuous(name("timelines.score.strong_negative_feedback"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_REPORT_TWEET_CLICKED = + new Continuous(name("timelines.score.report_tweet_clicked"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_UNFOLLOW_TOPIC = + new Continuous(name("timelines.score.unfollow_topic"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_FOLLOW = + new Continuous(name("timelines.score.follow"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_RELEVANCE_PROMPT_YES_CLICKED = + new Continuous( + name("timelines.score.relevance_prompt_yes_clicked"), + Set(EngagementScore).asJava) + val PREDICTED_SCORE_BOOKMARK = + new Continuous(name("timelines.score.bookmark"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_SHARE = + new Continuous(name("timelines.score.share"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_SHARE_MENU_CLICK = + new Continuous(name("timelines.score.share_menu_click"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_PROFILE_DWELLED = + new Continuous(name("timelines.score.good_profile_dwelled"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_TWEET_DETAIL_DWELLED = + new Continuous(name("timelines.score.tweet_detail_dwelled"), Set(EngagementScore).asJava) + val PREDICTED_SCORE_FULLSCREEN_VIDEO_DWELL = + new Continuous(name("timelines.score.fullscreen_video_dwell"), Set(EngagementScore).asJava) + + // hydrated in TimelinesSharedFeaturesAdapter that recap adapter calls + val ORIGINAL_AUTHOR_ID = new Discrete(name("entities.original_author_id"), Set(UserId).asJava) + val SOURCE_AUTHOR_ID = new Discrete(name("entities.source_author_id"), Set(UserId).asJava) + val SOURCE_TWEET_ID = new Discrete(name("entities.source_tweet_id"), Set(TweetId).asJava) + val TOPIC_ID = new Discrete(name("entities.topic_id"), Set(SemanticcoreClassification).asJava) + val INFERRED_TOPIC_IDS = + new SparseBinary(name("entities.inferred_topic_ids"), 
Set(SemanticcoreClassification).asJava) + val INFERRED_TOPIC_ID = TypedAggregateGroup.sparseFeature(INFERRED_TOPIC_IDS) + + val WEIGHTED_FAV_COUNT = new Continuous( + name("timelines.earlybird.weighted_fav_count"), + Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) + val WEIGHTED_RETWEET_COUNT = new Continuous( + name("timelines.earlybird.weighted_retweet_count"), + Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) + val WEIGHTED_REPLY_COUNT = new Continuous( + name("timelines.earlybird.weighted_reply_count"), + Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) + val WEIGHTED_QUOTE_COUNT = new Continuous( + name("timelines.earlybird.weighted_quote_count"), + Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) + val EMBEDS_IMPRESSION_COUNT_V2 = new Continuous( + name("timelines.earlybird.embeds_impression_count_v2"), + Set(CountOfImpression).asJava) + val EMBEDS_URL_COUNT_V2 = new Continuous( + name("timelines.earlybird.embeds_url_count_v2"), + Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava) + val DECAYED_FAVORITE_COUNT = new Continuous( + name("timelines.earlybird.decayed_favorite_count"), + Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) + val DECAYED_RETWEET_COUNT = new Continuous( + name("timelines.earlybird.decayed_retweet_count"), + Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) + val DECAYED_REPLY_COUNT = new Continuous( + name("timelines.earlybird.decayed_reply_count"), + Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) + val DECAYED_QUOTE_COUNT = new Continuous( + name("timelines.earlybird.decayed_quote_count"), + Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) + val FAKE_FAVORITE_COUNT = new Continuous( + name("timelines.earlybird.fake_favorite_count"), + Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) + val FAKE_RETWEET_COUNT = new Continuous( + name("timelines.earlybird.fake_retweet_count"), + Set(CountOfPrivateRetweets, 
CountOfPublicRetweets).asJava) + val FAKE_REPLY_COUNT = new Continuous( + name("timelines.earlybird.fake_reply_count"), + Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) + val FAKE_QUOTE_COUNT = new Continuous( + name("timelines.earlybird.fake_quote_count"), + Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) + val QUOTE_COUNT = new Continuous( + name("timelines.earlybird.quote_count"), + Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) + + // Safety features + val LABEL_ABUSIVE_FLAG = + new Binary(name("timelines.earlybird.label_abusive_flag"), Set(TweetSafetyLabels).asJava) + val LABEL_ABUSIVE_HI_RCL_FLAG = + new Binary(name("timelines.earlybird.label_abusive_hi_rcl_flag"), Set(TweetSafetyLabels).asJava) + val LABEL_DUP_CONTENT_FLAG = + new Binary(name("timelines.earlybird.label_dup_content_flag"), Set(TweetSafetyLabels).asJava) + val LABEL_NSFW_HI_PRC_FLAG = + new Binary(name("timelines.earlybird.label_nsfw_hi_prc_flag"), Set(TweetSafetyLabels).asJava) + val LABEL_NSFW_HI_RCL_FLAG = + new Binary(name("timelines.earlybird.label_nsfw_hi_rcl_flag"), Set(TweetSafetyLabels).asJava) + val LABEL_SPAM_FLAG = + new Binary(name("timelines.earlybird.label_spam_flag"), Set(TweetSafetyLabels).asJava) + val LABEL_SPAM_HI_RCL_FLAG = + new Binary(name("timelines.earlybird.label_spam_hi_rcl_flag"), Set(TweetSafetyLabels).asJava) + + // Periscope features + val PERISCOPE_EXISTS = new Binary( + name("timelines.earlybird.periscope_exists"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val PERISCOPE_IS_LIVE = new Binary( + name("timelines.earlybird.periscope_is_live"), + Set(PrivateBroadcastMetrics, PublicBroadcastMetrics).asJava) + val PERISCOPE_HAS_BEEN_FEATURED = new Binary( + name("timelines.earlybird.periscope_has_been_featured"), + Set(PrivateBroadcastMetrics, PublicBroadcastMetrics).asJava) + val PERISCOPE_IS_CURRENTLY_FEATURED = new Binary( + name("timelines.earlybird.periscope_is_currently_featured"), + 
Set(PrivateBroadcastMetrics, PublicBroadcastMetrics).asJava + ) + val PERISCOPE_IS_FROM_QUALITY_SOURCE = new Binary( + name("timelines.earlybird.periscope_is_from_quality_source"), + Set(PrivateBroadcastMetrics, PublicBroadcastMetrics).asJava + ) + + val VISIBLE_TOKEN_RATIO = new Continuous(name("timelines.earlybird.visible_token_ratio")) + val HAS_QUOTE = new Binary( + name("timelines.earlybird.has_quote"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val IS_COMPOSER_SOURCE_CAMERA = new Binary( + name("timelines.earlybird.is_composer_source_camera"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + + val EARLYBIRD_SCORE = new Continuous( + name("timelines.earlybird_score"), + Set(EngagementScore).asJava + ) // separating from the rest of "timelines.earlybird." namespace + + val DWELL_TIME_MS = new Continuous( + name("timelines.engagement.dwell_time_ms"), + Set(EngagementDurationAndTimestamp, ImpressionMetadata, PrivateTimestamp).asJava) + + val TWEET_DETAIL_DWELL_TIME_MS = new Continuous( + name("timelines.engagement.tweet_detail_dwell_time_ms"), + Set(EngagementDurationAndTimestamp, ImpressionMetadata, PrivateTimestamp).asJava) + + val PROFILE_DWELL_TIME_MS = new Continuous( + name("timelines.engagement.profile_dwell_time_ms"), + Set(EngagementDurationAndTimestamp, ImpressionMetadata, PrivateTimestamp).asJava) + + val FULLSCREEN_VIDEO_DWELL_TIME_MS = new Continuous( + name("timelines.engagement.fullscreen_video_dwell_time_ms"), + Set(EngagementDurationAndTimestamp, ImpressionMetadata, PrivateTimestamp).asJava) + + val LINK_DWELL_TIME_MS = new Continuous( + name("timelines.engagement.link_dwell_time_ms"), + Set(EngagementDurationAndTimestamp, ImpressionMetadata, PrivateTimestamp).asJava) + + val ASPECT_RATIO_DEN = new Continuous( + name("tweetsource.tweet.media.aspect_ratio_den"), + Set(MediaFile, MediaProcessingInformation).asJava) + val ASPECT_RATIO_NUM = new Continuous( + 
name("tweetsource.tweet.media.aspect_ratio_num"), + Set(MediaFile, MediaProcessingInformation).asJava) + val BIT_RATE = new Continuous( + name("tweetsource.tweet.media.bit_rate"), + Set(MediaFile, MediaProcessingInformation).asJava) + val HEIGHT_2 = new Continuous( + name("tweetsource.tweet.media.height_2"), + Set(MediaFile, MediaProcessingInformation).asJava) + val HEIGHT_1 = new Continuous( + name("tweetsource.tweet.media.height_1"), + Set(MediaFile, MediaProcessingInformation).asJava) + val HEIGHT_3 = new Continuous( + name("tweetsource.tweet.media.height_3"), + Set(MediaFile, MediaProcessingInformation).asJava) + val HEIGHT_4 = new Continuous( + name("tweetsource.tweet.media.height_4"), + Set(MediaFile, MediaProcessingInformation).asJava) + val RESIZE_METHOD_1 = new Discrete( + name("tweetsource.tweet.media.resize_method_1"), + Set(MediaFile, MediaProcessingInformation).asJava) + val RESIZE_METHOD_2 = new Discrete( + name("tweetsource.tweet.media.resize_method_2"), + Set(MediaFile, MediaProcessingInformation).asJava) + val RESIZE_METHOD_3 = new Discrete( + name("tweetsource.tweet.media.resize_method_3"), + Set(MediaFile, MediaProcessingInformation).asJava) + val RESIZE_METHOD_4 = new Discrete( + name("tweetsource.tweet.media.resize_method_4"), + Set(MediaFile, MediaProcessingInformation).asJava) + val VIDEO_DURATION = new Continuous( + name("tweetsource.tweet.media.video_duration"), + Set(MediaFile, MediaProcessingInformation).asJava) + val WIDTH_1 = new Continuous( + name("tweetsource.tweet.media.width_1"), + Set(MediaFile, MediaProcessingInformation).asJava) + val WIDTH_2 = new Continuous( + name("tweetsource.tweet.media.width_2"), + Set(MediaFile, MediaProcessingInformation).asJava) + val WIDTH_3 = new Continuous( + name("tweetsource.tweet.media.width_3"), + Set(MediaFile, MediaProcessingInformation).asJava) + val WIDTH_4 = new Continuous( + name("tweetsource.tweet.media.width_4"), + Set(MediaFile, MediaProcessingInformation).asJava) + val NUM_MEDIA_TAGS = 
new Continuous( + name("tweetsource.tweet.media.num_tags"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val MEDIA_TAG_SCREEN_NAMES = new SparseBinary( + name("tweetsource.tweet.media.tag_screen_names"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val STICKER_IDS = new SparseBinary( + name("tweetsource.tweet.media.sticker_ids"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + + val NUM_COLOR_PALLETTE_ITEMS = new Continuous( + name("tweetsource.v2.tweet.media.num_color_pallette_items"), + Set(MediaFile, MediaProcessingInformation).asJava) + val COLOR_1_RED = new Continuous( + name("tweetsource.v2.tweet.media.color_1_red"), + Set(MediaFile, MediaProcessingInformation).asJava) + val COLOR_1_BLUE = new Continuous( + name("tweetsource.v2.tweet.media.color_1_blue"), + Set(MediaFile, MediaProcessingInformation).asJava) + val COLOR_1_GREEN = new Continuous( + name("tweetsource.v2.tweet.media.color_1_green"), + Set(MediaFile, MediaProcessingInformation).asJava) + val COLOR_1_PERCENTAGE = new Continuous( + name("tweetsource.v2.tweet.media.color_1_percentage"), + Set(MediaFile, MediaProcessingInformation).asJava) + val MEDIA_PROVIDERS = new SparseBinary( + name("tweetsource.v2.tweet.media.providers"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val IS_360 = new Binary( + name("tweetsource.v2.tweet.media.is_360"), + Set(MediaFile, MediaProcessingInformation).asJava) + val VIEW_COUNT = + new Continuous(name("tweetsource.v2.tweet.media.view_count"), Set(MediaContentMetrics).asJava) + val IS_MANAGED = new Binary( + name("tweetsource.v2.tweet.media.is_managed"), + Set(MediaFile, MediaProcessingInformation).asJava) + val IS_MONETIZABLE = new Binary( + name("tweetsource.v2.tweet.media.is_monetizable"), + Set(MediaFile, MediaProcessingInformation).asJava) + val IS_EMBEDDABLE = new Binary( + name("tweetsource.v2.tweet.media.is_embeddable"), + 
Set(MediaFile, MediaProcessingInformation).asJava) + val CLASSIFICATION_LABELS = new SparseContinuous( + name("tweetsource.v2.tweet.media.classification_labels"), + Set(MediaFile, MediaProcessingInformation).asJava) + + val NUM_STICKERS = new Continuous( + name("tweetsource.v2.tweet.media.num_stickers"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val NUM_FACES = new Continuous( + name("tweetsource.v2.tweet.media.num_faces"), + Set(MediaFile, MediaProcessingInformation).asJava) + val FACE_AREAS = new Continuous( + name("tweetsource.v2.tweet.media.face_areas"), + Set(MediaFile, MediaProcessingInformation).asJava) + val HAS_SELECTED_PREVIEW_IMAGE = new Binary( + name("tweetsource.v2.tweet.media.has_selected_preview_image"), + Set(MediaFile, MediaProcessingInformation).asJava) + val HAS_TITLE = new Binary( + name("tweetsource.v2.tweet.media.has_title"), + Set(MediaFile, MediaProcessingInformation).asJava) + val HAS_DESCRIPTION = new Binary( + name("tweetsource.v2.tweet.media.has_description"), + Set(MediaFile, MediaProcessingInformation).asJava) + val HAS_VISIT_SITE_CALL_TO_ACTION = new Binary( + name("tweetsource.v2.tweet.media.has_visit_site_call_to_action"), + Set(MediaFile, MediaProcessingInformation).asJava) + val HAS_APP_INSTALL_CALL_TO_ACTION = new Binary( + name("tweetsource.v2.tweet.media.has_app_install_call_to_action"), + Set(MediaFile, MediaProcessingInformation).asJava) + val HAS_WATCH_NOW_CALL_TO_ACTION = new Binary( + name("tweetsource.v2.tweet.media.has_watch_now_call_to_action"), + Set(MediaFile, MediaProcessingInformation).asJava) + + val NUM_CAPS = + new Continuous(name("tweetsource.tweet.text.num_caps"), Set(PublicTweets, PrivateTweets).asJava) + val TWEET_LENGTH = + new Continuous(name("tweetsource.tweet.text.length"), Set(PublicTweets, PrivateTweets).asJava) + val TWEET_LENGTH_TYPE = new Discrete( + name("tweetsource.tweet.text.length_type"), + Set(PublicTweets, PrivateTweets).asJava) + val NUM_WHITESPACES = 
new Continuous( + name("tweetsource.tweet.text.num_whitespaces"), + Set(PublicTweets, PrivateTweets).asJava) + val HAS_QUESTION = + new Binary(name("tweetsource.tweet.text.has_question"), Set(PublicTweets, PrivateTweets).asJava) + val NUM_NEWLINES = new Continuous( + name("tweetsource.tweet.text.num_newlines"), + Set(PublicTweets, PrivateTweets).asJava) + val EMOJI_TOKENS = new SparseBinary( + name("tweetsource.v3.tweet.text.emoji_tokens"), + Set(PublicTweets, PrivateTweets).asJava) + val EMOTICON_TOKENS = new SparseBinary( + name("tweetsource.v3.tweet.text.emoticon_tokens"), + Set(PublicTweets, PrivateTweets).asJava) + val NUM_EMOJIS = new Continuous( + name("tweetsource.v3.tweet.text.num_emojis"), + Set(PublicTweets, PrivateTweets).asJava) + val NUM_EMOTICONS = new Continuous( + name("tweetsource.v3.tweet.text.num_emoticons"), + Set(PublicTweets, PrivateTweets).asJava) + val POS_UNIGRAMS = new SparseBinary( + name("tweetsource.v3.tweet.text.pos_unigrams"), + Set(PublicTweets, PrivateTweets).asJava) + val POS_BIGRAMS = new SparseBinary( + name("tweetsource.v3.tweet.text.pos_bigrams"), + Set(PublicTweets, PrivateTweets).asJava) + val TEXT_TOKENS = new SparseBinary( + name("tweetsource.v4.tweet.text.tokens"), + Set(PublicTweets, PrivateTweets).asJava) + + // Health features model scores (see go/toxicity, go/pblock, go/pspammytweet) + val PBLOCK_SCORE = + new Continuous(name("timelines.earlybird.pblock_score"), Set(TweetSafetyScores).asJava) + val TOXICITY_SCORE = + new Continuous(name("timelines.earlybird.toxicity_score"), Set(TweetSafetyScores).asJava) + val EXPERIMENTAL_HEALTH_MODEL_SCORE_1 = + new Continuous( + name("timelines.earlybird.experimental_health_model_score_1"), + Set(TweetSafetyScores).asJava) + val EXPERIMENTAL_HEALTH_MODEL_SCORE_2 = + new Continuous( + name("timelines.earlybird.experimental_health_model_score_2"), + Set(TweetSafetyScores).asJava) + val EXPERIMENTAL_HEALTH_MODEL_SCORE_3 = + new Continuous( + 
name("timelines.earlybird.experimental_health_model_score_3"), + Set(TweetSafetyScores).asJava) + val EXPERIMENTAL_HEALTH_MODEL_SCORE_4 = + new Continuous( + name("timelines.earlybird.experimental_health_model_score_4"), + Set(TweetSafetyScores).asJava) + val PSPAMMY_TWEET_SCORE = + new Continuous(name("timelines.earlybird.pspammy_tweet_score"), Set(TweetSafetyScores).asJava) + val PREPORTED_TWEET_SCORE = + new Continuous(name("timelines.earlybird.preported_tweet_score"), Set(TweetSafetyScores).asJava) + + // where record was displayed e.g. recap vs ranked timeline vs recycled + // (do NOT use for training in prediction, since this is set post-scoring) + // This differs from TimelinesSharedFeatures.INJECTION_TYPE, which is only + // set to Recap or Rectweet, and is available pre-scoring. + // This also differs from TimeFeatures.IS_TWEET_RECYCLED, which is set + // pre-scoring and indicates if a tweet is being considered for recycling. + // In contrast, DISPLAY_SUGGEST_TYPE == RecycledTweet means the tweet + // was actually served in a recycled tweet module. The two should currently + // have the same value, but need not in future, so please only use + // IS_TWEET_RECYCLED/CANDIDATE_TWEET_SOURCE_ID for training models and + // only use DISPLAY_SUGGEST_TYPE for offline analysis of tweets actually + // served in recycled modules. + val DISPLAY_SUGGEST_TYPE = new Discrete(name("recap.display.suggest_type")) + + // Candidate tweet source id - related to DISPLAY_SUGGEST_TYPE above, but this is a + // property of the candidate rather than display location so is safe to use + // in model training, unlike DISPLAY_SUGGEST_TYPE. 
+ val CANDIDATE_TWEET_SOURCE_ID = + new Discrete(name("timelines.meta.candidate_tweet_source_id"), Set(TweetId).asJava) + + // Was at least 50% of this tweet in the user's viewport for at least 500 ms, + // OR did the user engage with the tweet publicly or privately + val IS_LINGER_IMPRESSION = + new Binary(name("timelines.engagement.is_linger_impression"), Set(EngagementsPrivate).asJava) + + // Features to create rollups + val LANGUAGE_GROUP = new Discrete(name("timelines.tweet.text.language_group")) + + // The final position index of the tweet being trained on in the timeline + // served from TLM (could still change later in TLS-API), as recorded by + // PositionIndexLoggingEnvelopeTransform. + val FINAL_POSITION_INDEX = new Discrete(name("timelines.display.final_position_index")) + + // The traceId of the timeline request, can be used to group tweets in the same response. + val TRACE_ID = new Discrete(name("timelines.display.trace_id"), Set(TfeTransactionId).asJava) + + // Whether this tweet was randomly injected into the timeline or not, for exploration purposes + val IS_RANDOM_TWEET = new Binary(name("timelines.display.is_random_tweet")) + + // Whether this tweet was reordered with softmax ranking for explore/exploit, and needs to + // be excluded from exploit only holdback + val IS_SOFTMAX_RANKING_TWEET = new Binary(name("timelines.display.is_softmax_ranking_tweet")) + + // Whether the user viewing the tweet has disabled ranked timeline. + val IS_RANKED_TIMELINE_DISABLER = new Binary( + name("timelines.user_features.is_ranked_timeline_disabler"), + Set(AnnotationValue, GeneralSettings).asJava) + + // Whether the user viewing the tweet was one of those released from DDG 4205 control + // as part of http://go/shrink-4205 process to shrink the quality features holdback. 
+ val IS_USER_RELEASED_FROM_QUALITY_HOLDBACK = new Binary( + name("timelines.user_features.is_released_from_quality_holdback"), + Set(ExperimentId, ExperimentName).asJava) + + val INITIAL_PREDICTION_FAV = + new Continuous(name("timelines.initial_prediction.fav"), Set(EngagementScore).asJava) + val INITIAL_PREDICTION_RETWEET = + new Continuous(name("timelines.initial_prediction.retweet"), Set(EngagementScore).asJava) + val INITIAL_PREDICTION_REPLY = + new Continuous(name("timelines.initial_prediction.reply"), Set(EngagementScore).asJava) + val INITIAL_PREDICTION_OPEN_LINK = + new Continuous(name("timelines.initial_prediction.open_link"), Set(EngagementScore).asJava) + val INITIAL_PREDICTION_PROFILE_CLICK = + new Continuous(name("timelines.initial_prediction.profile_click"), Set(EngagementScore).asJava) + val INITIAL_PREDICTION_VIDEO_PLAYBACK_50 = new Continuous( + name("timelines.initial_prediction.video_playback_50"), + Set(EngagementScore).asJava) + val INITIAL_PREDICTION_DETAIL_EXPAND = + new Continuous(name("timelines.initial_prediction.detail_expand"), Set(EngagementScore).asJava) + val INITIAL_PREDICTION_PHOTO_EXPAND = + new Continuous(name("timelines.initial_prediction.photo_expand"), Set(EngagementScore).asJava) + + val VIEWER_FOLLOWS_ORIGINAL_AUTHOR = + new Binary(name("timelines.viewer_follows_original_author"), Set(Follow).asJava) + + val IS_TOP_ONE = new Binary(name("timelines.position.is_top_one")) + val IS_TOP_FIVE = + new Binary(name(featureName = "timelines.position.is_top_five")) + val IS_TOP_TEN = + new Binary(name(featureName = "timelines.position.is_top_ten")) + + val LOG_POSITION = + new Continuous(name(featureName = "timelines.position.log_10")) + +} diff --git a/src/scala/com/twitter/timelines/prediction/features/engagement_features/BUILD b/src/scala/com/twitter/timelines/prediction/features/engagement_features/BUILD new file mode 100644 index 0000000000..f6caadea01 --- /dev/null +++ 
b/src/scala/com/twitter/timelines/prediction/features/engagement_features/BUILD @@ -0,0 +1,12 @@ +scala_library( + sources = ["*.scala"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "src/java/com/twitter/ml/api:api-base", + "src/thrift/com/twitter/dal/personal_data:personal_data-java", + "src/thrift/com/twitter/timelineservice/server/suggests/features/engagement_features:thrift-scala", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + "timelines/data_processing/ml_util/transforms", + ], +) diff --git a/src/scala/com/twitter/timelines/prediction/features/engagement_features/EngagementFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/engagement_features/EngagementFeatures.scala new file mode 100644 index 0000000000..e65c9db202 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/engagement_features/EngagementFeatures.scala @@ -0,0 +1,246 @@ +package com.twitter.timelines.prediction.features.engagement_features + +import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ +import com.twitter.logging.Logger +import com.twitter.ml.api.DataRecord +import com.twitter.ml.api.Feature +import com.twitter.ml.api.Feature.Continuous +import com.twitter.ml.api.Feature.SparseBinary +import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform +import com.twitter.timelines.data_processing.ml_util.transforms.RichITransform +import com.twitter.timelines.data_processing.ml_util.transforms.SparseBinaryUnion +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup +import com.twitter.timelineservice.suggests.features.engagement_features.thriftscala.{ + EngagementFeatures => ThriftEngagementFeatures +} +import com.twitter.timelineservice.suggests.features.engagement_features.v1.thriftscala.{ + EngagementFeatures => ThriftEngagementFeaturesV1 +} +import scala.collection.JavaConverters._ + +object EngagementFeatures { + 
private[this] val logger = Logger.get(getClass.getSimpleName)

  // Closed set of engagement feature kinds; `sealed` restricts implementations to this file.
  sealed trait EngagementFeature
  case object Count extends EngagementFeature
  case object RealGraphWeightAverage extends EngagementFeature
  case object RealGraphWeightMax extends EngagementFeature
  case object RealGraphWeightMin extends EngagementFeature
  case object RealGraphWeightMissing extends EngagementFeature
  case object RealGraphWeightVariance extends EngagementFeature
  case object UserIds extends EngagementFeature

  /**
   * Builds an [[EngagementFeatures]] from its Thrift union representation.
   *
   * Only the V1 variant is understood. Its three user-id lists are copied over, and
   * `realGraphWeightByUser` is left at its default (empty) value.
   *
   * @param thriftEngagementFeatures the Thrift union to convert
   * @return Some(features) for a V1 payload; None, after logging an error, for any other variant
   */
  def fromThrift(thriftEngagementFeatures: ThriftEngagementFeatures): Option[EngagementFeatures] =
    thriftEngagementFeatures match {
      case versioned: ThriftEngagementFeatures.V1 =>
        val payload = versioned.v1
        Some(
          EngagementFeatures(
            favoritedBy = payload.favoritedBy,
            retweetedBy = payload.retweetedBy,
            repliedBy = payload.repliedBy
          )
        )
      case _ =>
        logger.error("Unexpected EngagementFeatures version found.")
        None
    }

  // Instance with every field at its default value.
  val empty: EngagementFeatures = EngagementFeatures()
}

/**
 * User IDs that engaged with a target entity (for example a Tweet), together with
 * any extra data required to compute derived features.
 *
 * @param favoritedBy IDs of users who favorited the entity
 * @param retweetedBy IDs of users who retweeted the entity
 * @param repliedBy IDs of users who replied to the entity
 * @param realGraphWeightByUser presumably the Real Graph weight keyed by user ID (TODO confirm);
 *                              not consulted by `isEmpty` and not written by `toLogThrift`
 */
case class EngagementFeatures(
  favoritedBy: Seq[Long] = Nil,
  retweetedBy: Seq[Long] = Nil,
  repliedBy: Seq[Long] = Nil,
  realGraphWeightByUser: Map[Long, Double] = Map.empty) {

  /** True when none of the three engagement lists contains a user ID. */
  def isEmpty: Boolean = !(favoritedBy.nonEmpty || retweetedBy.nonEmpty || repliedBy.nonEmpty)

  /** Negation of [[isEmpty]]. */
  def nonEmpty: Boolean = !isEmpty

  /** Wraps the three engagement lists in the V1 variant of the Thrift union. */
  def toLogThrift: ThriftEngagementFeatures.V1 = {
    val v1Struct = ThriftEngagementFeaturesV1(
      favoritedBy = favoritedBy,
      retweetedBy = retweetedBy,
      repliedBy = repliedBy
    )
    ThriftEngagementFeatures.V1(v1Struct)
  }
}

/**
 * Represents engagement features derived from the Real Graph weight.
 *
 * These features are from the perspective of the source user, who is viewing their
 * timeline, to the destination users (or user), who created engagements.
/**
 * Engagement features derived from Real Graph weights.
 *
 * The values are taken from the point of view of the source user (the one
 * viewing their timeline) towards the destination user or users who created
 * the engagements.
 *
 * @param count    number of engagements present
 * @param max      max score of the engaging users
 * @param mean     average score of the engaging users
 * @param min      minimum score of the engaging users
 * @param missing  for engagements present, how many Real Graph scores were missing
 * @param variance variance of scores of the engaging users
 */
case class RealGraphDerivedEngagementFeatures(
  count: Int,
  max: Double,
  mean: Double,
  min: Double,
  missing: Int,
  variance: Double)

/**
 * DataRecord feature definitions for engagement features, grouped per
 * engagement type, plus the transform that unions the per-type engager ID
 * features into a single public-engager feature.
 */
object EngagementDataRecordFeatures {
  import EngagementFeatures._

  // Personal data type annotations shared by several features of the same
  // engagement type. Each call builds a fresh set, as the inline literals did.
  private[this] def likeCountTypes = Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
  private[this] def retweetCountTypes = Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
  private[this] def replyCountTypes = Set(CountOfPrivateReplies, CountOfPublicReplies).asJava

  // IDs of the engaging users, one sparse feature per engagement type.
  val FavoritedByUserIds = new SparseBinary(
    "engagement_features.user_ids.favorited_by",
    Set(UserId, PrivateLikes, PublicLikes).asJava
  )
  val RetweetedByUserIds = new SparseBinary(
    "engagement_features.user_ids.retweeted_by",
    Set(UserId, PrivateRetweets, PublicRetweets).asJava
  )
  val RepliedByUserIds = new SparseBinary(
    "engagement_features.user_ids.replied_by",
    Set(UserId, PrivateReplies, PublicReplies).asJava
  )

  // In-network engagement counts.
  val InNetworkFavoritesCount =
    new Continuous("engagement_features.in_network.favorites.count", likeCountTypes)
  val InNetworkRetweetsCount =
    new Continuous("engagement_features.in_network.retweets.count", retweetCountTypes)
  val InNetworkRepliesCount =
    new Continuous("engagement_features.in_network.replies.count", replyCountTypes)

  // Real Graph derived features: favorites.
  val InNetworkFavoritesAvgRealGraphWeight =
    new Continuous("engagement_features.real_graph.favorites.avg_weight", likeCountTypes)
  val InNetworkFavoritesMaxRealGraphWeight =
    new Continuous("engagement_features.real_graph.favorites.max_weight", likeCountTypes)
  val InNetworkFavoritesMinRealGraphWeight =
    new Continuous("engagement_features.real_graph.favorites.min_weight", likeCountTypes)
  val InNetworkFavoritesRealGraphWeightMissing =
    new Continuous("engagement_features.real_graph.favorites.missing")
  val InNetworkFavoritesRealGraphWeightVariance =
    new Continuous("engagement_features.real_graph.favorites.weight_variance")

  // Real Graph derived features: retweets.
  val InNetworkRetweetsMaxRealGraphWeight =
    new Continuous("engagement_features.real_graph.retweets.max_weight", retweetCountTypes)
  val InNetworkRetweetsMinRealGraphWeight =
    new Continuous("engagement_features.real_graph.retweets.min_weight", retweetCountTypes)
  val InNetworkRetweetsAvgRealGraphWeight =
    new Continuous("engagement_features.real_graph.retweets.avg_weight", retweetCountTypes)
  val InNetworkRetweetsRealGraphWeightMissing =
    new Continuous("engagement_features.real_graph.retweets.missing")
  val InNetworkRetweetsRealGraphWeightVariance =
    new Continuous("engagement_features.real_graph.retweets.weight_variance")

  // Real Graph derived features: replies.
  val InNetworkRepliesMaxRealGraphWeight =
    new Continuous("engagement_features.real_graph.replies.max_weight", replyCountTypes)
  val InNetworkRepliesMinRealGraphWeight =
    new Continuous("engagement_features.real_graph.replies.min_weight", replyCountTypes)
  val InNetworkRepliesAvgRealGraphWeight =
    new Continuous("engagement_features.real_graph.replies.avg_weight", replyCountTypes)
  val InNetworkRepliesRealGraphWeightMissing =
    new Continuous("engagement_features.real_graph.replies.missing")
  val InNetworkRepliesRealGraphWeightVariance =
    new Continuous("engagement_features.real_graph.replies.weight_variance")

  /**
   * All DataRecord features belonging to one engagement type, indexed by the
   * kind of derived feature they hold.
   */
  sealed trait FeatureGroup {
    def continuousFeatures: Map[EngagementFeature, Continuous]
    def sparseBinaryFeatures: Map[EngagementFeature, SparseBinary]

    /** Continuous features followed by sparse binary features. */
    def allFeatures: Seq[Feature[_]] = {
      val continuous: Iterable[Feature[_]] = continuousFeatures.values
      val sparse: Iterable[Feature[_]] = sparseBinaryFeatures.values
      (continuous ++ sparse).toSeq
    }
  }

  /** Feature group for favorites. */
  case object Favorites extends FeatureGroup {
    override val continuousFeatures: Map[EngagementFeature, Continuous] = Map(
      Count -> InNetworkFavoritesCount,
      RealGraphWeightAverage -> InNetworkFavoritesAvgRealGraphWeight,
      RealGraphWeightMax -> InNetworkFavoritesMaxRealGraphWeight,
      RealGraphWeightMin -> InNetworkFavoritesMinRealGraphWeight,
      RealGraphWeightMissing -> InNetworkFavoritesRealGraphWeightMissing,
      RealGraphWeightVariance -> InNetworkFavoritesRealGraphWeightVariance
    )

    override val sparseBinaryFeatures: Map[EngagementFeature, SparseBinary] = Map(
      UserIds -> FavoritedByUserIds
    )
  }

  /** Feature group for retweets. */
  case object Retweets extends FeatureGroup {
    override val continuousFeatures: Map[EngagementFeature, Continuous] = Map(
      Count -> InNetworkRetweetsCount,
      RealGraphWeightAverage -> InNetworkRetweetsAvgRealGraphWeight,
      RealGraphWeightMax -> InNetworkRetweetsMaxRealGraphWeight,
      RealGraphWeightMin -> InNetworkRetweetsMinRealGraphWeight,
      RealGraphWeightMissing -> InNetworkRetweetsRealGraphWeightMissing,
      RealGraphWeightVariance -> InNetworkRetweetsRealGraphWeightVariance
    )

    override val sparseBinaryFeatures: Map[EngagementFeature, SparseBinary] = Map(
      UserIds -> RetweetedByUserIds
    )
  }

  /** Feature group for replies. */
  case object Replies extends FeatureGroup {
    override val continuousFeatures: Map[EngagementFeature, Continuous] = Map(
      Count -> InNetworkRepliesCount,
      RealGraphWeightAverage -> InNetworkRepliesAvgRealGraphWeight,
      RealGraphWeightMax -> InNetworkRepliesMaxRealGraphWeight,
      RealGraphWeightMin -> InNetworkRepliesMinRealGraphWeight,
      RealGraphWeightMissing -> InNetworkRepliesRealGraphWeightMissing,
      RealGraphWeightVariance -> InNetworkRepliesRealGraphWeightVariance
    )

    override val sparseBinaryFeatures: Map[EngagementFeature, SparseBinary] = Map(
      UserIds -> RepliedByUserIds
    )
  }

  // The per-type engager ID features that get merged into PublicEngagementUserIds.
  val PublicEngagerSets = Set(FavoritedByUserIds, RetweetedByUserIds, RepliedByUserIds)

  val PublicEngagementUserIds = new SparseBinary(
    "engagement_features.user_ids.public",
    Set(UserId, EngagementsPublic).asJava
  )

  // Aggregation key derived from the unified public-engager feature.
  val ENGAGER_ID = TypedAggregateGroup.sparseFeature(PublicEngagementUserIds)

  // Unions the three engager ID features into PublicEngagementUserIds.
  val UnifyPublicEngagersTransform = SparseBinaryUnion(
    featuresToUnify = PublicEngagerSets,
    outputFeature = PublicEngagementUserIds
  )

  /**
   * [[OneToSomeTransform]] adapter around `UnifyPublicEngagersTransform`; it
   * reports the unified input features as the features it transforms.
   */
  object RichUnifyPublicEngagersTransform extends OneToSomeTransform {
    override def apply(dataRecord: DataRecord): Option[DataRecord] =
      RichITransform(UnifyPublicEngagersTransform)(dataRecord)

    override def featuresToTransform: Set[Feature[_]] =
      UnifyPublicEngagersTransform.featuresToUnify.toSet
  }
}
/** DataRecord feature definitions for Escherbird annotations on a Tweet. */
object EscherbirdFeatures {
  val TweetGroupIds = new Feature.SparseBinary("escherbird.tweet_group_ids")
  val TweetDomainIds =
    new Feature.SparseBinary("escherbird.tweet_domain_ids", Set(DomainId).asJava)
  val TweetEntityIds = new Feature.SparseBinary(
    "escherbird.tweet_entity_ids",
    Set(SemanticcoreClassification).asJava
  )
}

/**
 * Escherbird annotation IDs attached to a single Tweet, as string sets.
 *
 * @param tweetId        ID of the annotated Tweet
 * @param tweetGroupIds  distinct annotation group IDs
 * @param tweetDomainIds distinct annotation domain IDs
 * @param tweetEntityIds distinct entity IDs
 */
case class EscherbirdFeatures(
  tweetId: Long,
  tweetGroupIds: JSet[String],
  tweetDomainIds: JSet[String],
  tweetEntityIds: JSet[String])

/** Builds [[EscherbirdFeatures]] from the Escherbird annotations of a thrift Tweet. */
object EscherbirdFeaturesConverter {
  // Annotations whose domain ID is in this set are dropped before any IDs are collected.
  val DeprecatedOrTestDomains = Set(1L, 5L, 7L, 9L, 14L, 19L, 20L, 31L)

  /**
   * @param tweet thrift Tweet to read annotations from
   * @return `None` when the Tweet has no `escherbirdEntityAnnotations`; otherwise
   *         the distinct group, domain and entity IDs of the annotations that
   *         are not in a deprecated or test domain (possibly all empty sets)
   */
  def fromTweet(tweet: Tweet): Option[EscherbirdFeatures] =
    for (annotationSet <- tweet.escherbirdEntityAnnotations) yield {
      val kept = annotationSet.entityAnnotations
        .filter(annotation => !DeprecatedOrTestDomains.contains(annotation.domainId))

      val groupIds = kept.map(_.groupId.toString).toSet.asJava
      val domainIds = kept.map(_.domainId.toString).toSet.asJava
      // Entity IDs are unique only within their domain, so qualify each with its domain ID.
      val entityIds = kept.map(a => s"${a.domainId}.${a.entityId}").toSet.asJava

      EscherbirdFeatures(
        tweetId = tweet.id,
        tweetGroupIds = groupIds,
        tweetDomainIds = domainIds,
        tweetEntityIds = entityIds
      )
    }
}
/** DataRecord feature definitions describing where and how a follow happened. */
object FollowSourceFeatures {

  // Personal data type annotation shared by the features tagged with Follow only.
  private[this] def followOnly = Set(Follow).asJava

  // Matches an algorithm constant defined in
  // com.twitter.hermit.profile.HermitProfileConstants.
  val FollowSourceAlgorithm = new Feature.Text("follow_source.algorithm")

  // Kind of follow action. One of:
  // "unfollow", "follow", "follow_back", "follow_many", "follow_all".
  val FollowAction = new Feature.Text(
    "follow_source.action",
    Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava
  )

  // Time of the follow, as a millisecond timestamp.
  val FollowTimestamp = new Feature.Discrete(
    "follow_source.follow_timestamp",
    Set(Follow, PrivateTimestamp).asJava
  )

  // How long ago the follow happened, in minutes.
  val FollowAgeMinutes = new Feature.Continuous("follow_source.follow_age_minutes", followOnly)

  // When the follow came from a tweet details page, the ID of that tweet.
  val FollowCauseTweetId =
    new Feature.Discrete("follow_source.cause_tweet_id", Set(TweetId).asJava)

  // Follow client as a string (android, web, iphone, etc). Taken from the "client"
  // portion of the client event namespace.
  val FollowClientId = new Feature.Text("follow_source.client_id", Set(ClientType).asJava)

  // When the follow was made through a profile's Following or Followers list,
  // this holds the ID of the owner of that profile.
  val FollowAssociationId = new Feature.Discrete(
    "follow_source.association_id",
    Set(Follow, UserId).asJava
  )

  // "Friendly name" computed with FollowSourceUtil.getSource. It groups a few client
  // events by where the event occurred. For example, events on the tweet details page
  // are grouped under "tweetDetails":
  //   case (Some("web"), Some("permalink"), _, _, _) => "tweetDetails"
  //   case (Some("iphone"), Some("tweet"), _, _, _) => "tweetDetails"
  //   case (Some("android"), Some("tweet"), _, _, _) => "tweetDetails"
  val FollowSourceFriendlyName = new Feature.Text("follow_source.friendly_name", followOnly)

  // Up to two sources and actions that came before the follow (for example, a profile
  // visit through a mention click, which itself was on a tweet detail page reached
  // through a tweet click in the Home tab). See go/followsource for more details and
  // examples. The "source" values are computed with FollowSourceUtil.getSource.
  val PreFollowAction1 = new Feature.Text("follow_source.pre_follow_action_1", followOnly)
  val PreFollowAction2 = new Feature.Text("follow_source.pre_follow_action_2", followOnly)
  val PreFollowSource1 = new Feature.Text("follow_source.pre_follow_source_1", followOnly)
  val PreFollowSource2 = new Feature.Text("follow_source.pre_follow_source_2", followOnly)
}
PrivateReplies).asJava) + // v1: post click engagements: fav, reply + val IS_GOOD_CLICKED_CONVO_DESC_V1 = new Binary( + "itl.engagement.is_good_clicked_convo_desc_favorited_or_replied", + Set( + PublicLikes, + PrivateLikes, + PublicReplies, + PrivateReplies, + EngagementsPrivate, + EngagementsPublic).asJava) + // v2: post click engagements: click + val IS_GOOD_CLICKED_CONVO_DESC_V2 = new Binary( + "itl.engagement.is_good_clicked_convo_desc_v2", + Set(TweetsClicked, EngagementsPrivate).asJava) + + val IS_GOOD_CLICKED_CONVO_DESC_FAVORITED = new Binary( + "itl.engagement.is_good_clicked_convo_desc_favorited", + Set(PublicLikes, PrivateLikes).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_REPLIED = new Binary( + "itl.engagement.is_good_clicked_convo_desc_replied", + Set(PublicReplies, PrivateReplies).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_RETWEETED = new Binary( + "itl.engagement.is_good_clicked_convo_desc_retweeted", + Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_CLICKED = new Binary( + "itl.engagement.is_good_clicked_convo_desc_clicked", + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_FOLLOWED = + new Binary("itl.engagement.is_good_clicked_convo_desc_followed", Set(EngagementsPrivate).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_SHARE_DM_CLICKED = new Binary( + "itl.engagement.is_good_clicked_convo_desc_share_dm_clicked", + Set(EngagementsPrivate).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_PROFILE_CLICKED = new Binary( + "itl.engagement.is_good_clicked_convo_desc_profile_clicked", + Set(EngagementsPrivate).asJava) + + val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_0 = new Binary( + "itl.engagement.is_good_clicked_convo_desc_uam_gt_0", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_1 = new Binary( + "itl.engagement.is_good_clicked_convo_desc_uam_gt_1", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val 
IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_2 = new Binary( + "itl.engagement.is_good_clicked_convo_desc_uam_gt_2", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_3 = new Binary( + "itl.engagement.is_good_clicked_convo_desc_uam_gt_3", + Set(EngagementsPrivate, EngagementsPublic).asJava) + + val IS_TWEET_DETAIL_DWELLED = new Binary( + "itl.engagement.is_tweet_detail_dwelled", + Set(TweetsClicked, EngagementsPrivate).asJava) + + val IS_TWEET_DETAIL_DWELLED_8_SEC = new Binary( + "itl.engagement.is_tweet_detail_dwelled_8_sec", + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_TWEET_DETAIL_DWELLED_15_SEC = new Binary( + "itl.engagement.is_tweet_detail_dwelled_15_sec", + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_TWEET_DETAIL_DWELLED_25_SEC = new Binary( + "itl.engagement.is_tweet_detail_dwelled_25_sec", + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_TWEET_DETAIL_DWELLED_30_SEC = new Binary( + "itl.engagement.is_tweet_detail_dwelled_30_sec", + Set(TweetsClicked, EngagementsPrivate).asJava) + + val IS_PROFILE_DWELLED = new Binary( + "itl.engagement.is_profile_dwelled", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_DWELLED_10_SEC = new Binary( + "itl.engagement.is_profile_dwelled_10_sec", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_DWELLED_20_SEC = new Binary( + "itl.engagement.is_profile_dwelled_20_sec", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_DWELLED_30_SEC = new Binary( + "itl.engagement.is_profile_dwelled_30_sec", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + + val IS_FULLSCREEN_VIDEO_DWELLED = new Binary( + "itl.engagement.is_fullscreen_video_dwelled", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_FULLSCREEN_VIDEO_DWELLED_5_SEC = new Binary( + "itl.engagement.is_fullscreen_video_dwelled_5_sec", + 
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_FULLSCREEN_VIDEO_DWELLED_10_SEC = new Binary( + "itl.engagement.is_fullscreen_video_dwelled_10_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_FULLSCREEN_VIDEO_DWELLED_20_SEC = new Binary( + "itl.engagement.is_fullscreen_video_dwelled_20_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_FULLSCREEN_VIDEO_DWELLED_30_SEC = new Binary( + "itl.engagement.is_fullscreen_video_dwelled_30_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_LINK_DWELLED_15_SEC = new Binary( + "itl.engagement.is_link_dwelled_15_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_LINK_DWELLED_30_SEC = new Binary( + "itl.engagement.is_link_dwelled_30_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_LINK_DWELLED_60_SEC = new Binary( + "itl.engagement.is_link_dwelled_60_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_QUOTED = + new Binary("itl.engagement.is_quoted", Set(PublicRetweets, PrivateRetweets).asJava) + val IS_RETWEETED_WITHOUT_QUOTE = new Binary( + "itl.engagement.is_retweeted_without_quote", + Set(PublicRetweets, PrivateRetweets).asJava) + val IS_CLICKED = new Binary( + "itl.engagement.is_clicked", + Set(EngagementsPrivate, TweetsClicked, LinksClickedOn).asJava) + val IS_PROFILE_CLICKED = new Binary( + "itl.engagement.is_profile_clicked", + Set(EngagementsPrivate, TweetsClicked, ProfilesViewed, ProfilesClicked).asJava) + val IS_DWELLED = new Binary("itl.engagement.is_dwelled", Set(EngagementsPrivate).asJava) + val IS_DWELLED_IN_BOUNDS_V1 = + new Binary("itl.engagement.is_dwelled_in_bounds_v1", Set(EngagementsPrivate).asJava) + val DWELL_NORMALIZED_OVERALL = + new 
Continuous("itl.engagement.dwell_normalized_overall", Set(EngagementsPrivate).asJava) + val DWELL_CDF_OVERALL = + new Continuous("itl.engagement.dwell_cdf_overall", Set(EngagementsPrivate).asJava) + val DWELL_CDF = new Continuous("itl.engagement.dwell_cdf", Set(EngagementsPrivate).asJava) + + val IS_DWELLED_1S = new Binary("itl.engagement.is_dwelled_1s", Set(EngagementsPrivate).asJava) + val IS_DWELLED_2S = new Binary("itl.engagement.is_dwelled_2s", Set(EngagementsPrivate).asJava) + val IS_DWELLED_3S = new Binary("itl.engagement.is_dwelled_3s", Set(EngagementsPrivate).asJava) + val IS_DWELLED_4S = new Binary("itl.engagement.is_dwelled_4s", Set(EngagementsPrivate).asJava) + val IS_DWELLED_5S = new Binary("itl.engagement.is_dwelled_5s", Set(EngagementsPrivate).asJava) + val IS_DWELLED_6S = new Binary("itl.engagement.is_dwelled_6s", Set(EngagementsPrivate).asJava) + val IS_DWELLED_7S = new Binary("itl.engagement.is_dwelled_7s", Set(EngagementsPrivate).asJava) + val IS_DWELLED_8S = new Binary("itl.engagement.is_dwelled_8s", Set(EngagementsPrivate).asJava) + val IS_DWELLED_9S = new Binary("itl.engagement.is_dwelled_9s", Set(EngagementsPrivate).asJava) + val IS_DWELLED_10S = new Binary("itl.engagement.is_dwelled_10s", Set(EngagementsPrivate).asJava) + + val IS_SKIPPED_1S = new Binary("itl.engagement.is_skipped_1s", Set(EngagementsPrivate).asJava) + val IS_SKIPPED_2S = new Binary("itl.engagement.is_skipped_2s", Set(EngagementsPrivate).asJava) + val IS_SKIPPED_3S = new Binary("itl.engagement.is_skipped_3s", Set(EngagementsPrivate).asJava) + val IS_SKIPPED_4S = new Binary("itl.engagement.is_skipped_4s", Set(EngagementsPrivate).asJava) + val IS_SKIPPED_5S = new Binary("itl.engagement.is_skipped_5s", Set(EngagementsPrivate).asJava) + val IS_SKIPPED_6S = new Binary("itl.engagement.is_skipped_6s", Set(EngagementsPrivate).asJava) + val IS_SKIPPED_7S = new Binary("itl.engagement.is_skipped_7s", Set(EngagementsPrivate).asJava) + val IS_SKIPPED_8S = new 
Binary("itl.engagement.is_skipped_8s", Set(EngagementsPrivate).asJava) + val IS_SKIPPED_9S = new Binary("itl.engagement.is_skipped_9s", Set(EngagementsPrivate).asJava) + val IS_SKIPPED_10S = new Binary("itl.engagement.is_skipped_10s", Set(EngagementsPrivate).asJava) + + val IS_FOLLOWED = + new Binary("itl.engagement.is_followed", Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_IMPRESSED = new Binary("itl.engagement.is_impressed", Set(EngagementsPrivate).asJava) + val IS_OPEN_LINKED = + new Binary("itl.engagement.is_open_linked", Set(EngagementsPrivate, LinksClickedOn).asJava) + val IS_PHOTO_EXPANDED = new Binary( + "itl.engagement.is_photo_expanded", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_VIDEO_VIEWED = + new Binary("itl.engagement.is_video_viewed", Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_VIDEO_PLAYBACK_50 = new Binary( + "itl.engagement.is_video_playback_50", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_VIDEO_QUALITY_VIEWED = new Binary( + "itl.engagement.is_video_quality_viewed", + Set(EngagementsPrivate, EngagementsPublic).asJava + ) + val IS_BOOKMARKED = + new Binary("itl.engagement.is_bookmarked", Set(EngagementsPrivate).asJava) + val IS_SHARED = + new Binary("itl.engagement.is_shared", Set(EngagementsPrivate).asJava) + val IS_SHARE_MENU_CLICKED = + new Binary("itl.engagement.is_share_menu_clicked", Set(EngagementsPrivate).asJava) + + // Negative engagements + val IS_DONT_LIKE = + new Binary("itl.engagement.is_dont_like", Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_BLOCK_CLICKED = new Binary( + "itl.engagement.is_block_clicked", + Set(TweetsClicked, EngagementsPrivate, EngagementsPublic).asJava) + val IS_BLOCK_DIALOG_BLOCKED = new Binary( + "itl.engagement.is_block_dialog_blocked", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_MUTE_CLICKED = + new Binary("itl.engagement.is_mute_clicked", Set(TweetsClicked, EngagementsPrivate).asJava) + val 
IS_MUTE_DIALOG_MUTED = + new Binary("itl.engagement.is_mute_dialog_muted", Set(EngagementsPrivate).asJava) + val IS_REPORT_TWEET_CLICKED = new Binary( + "itl.engagement.is_report_tweet_clicked", + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_CARET_CLICKED = + new Binary("itl.engagement.is_caret_clicked", Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_NOT_ABOUT_TOPIC = + new Binary("itl.engagement.is_not_about_topic", Set(EngagementsPrivate).asJava) + val IS_NOT_RECENT = + new Binary("itl.engagement.is_not_recent", Set(EngagementsPrivate).asJava) + val IS_NOT_RELEVANT = + new Binary("itl.engagement.is_not_relevant", Set(EngagementsPrivate).asJava) + val IS_SEE_FEWER = + new Binary("itl.engagement.is_see_fewer", Set(EngagementsPrivate).asJava) + val IS_UNFOLLOW_TOPIC = + new Binary("itl.engagement.is_unfollow_topic", Set(EngagementsPrivate).asJava) + val IS_FOLLOW_TOPIC = + new Binary("itl.engagement.is_follow_topic", Set(EngagementsPrivate).asJava) + val IS_NOT_INTERESTED_IN_TOPIC = + new Binary("itl.engagement.is_not_interested_in_topic", Set(EngagementsPrivate).asJava) + val IS_HOME_LATEST_VISITED = + new Binary("itl.engagement.is_home_latest_visited", Set(EngagementsPrivate).asJava) + + // This derived label is the logical OR of IS_DONT_LIKE, IS_BLOCK_CLICKED, IS_MUTE_CLICKED and IS_REPORT_TWEET_CLICKED + val IS_NEGATIVE_FEEDBACK = + new Binary("itl.engagement.is_negative_feedback", Set(EngagementsPrivate).asJava) + + // Reciprocal engagements for reply forward engagement + val IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR = new Binary( + "itl.engagement.is_replied_reply_impressed_by_author", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_REPLIED_REPLY_FAVORITED_BY_AUTHOR = new Binary( + "itl.engagement.is_replied_reply_favorited_by_author", + Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava) + val IS_REPLIED_REPLY_QUOTED_BY_AUTHOR = new Binary( + "itl.engagement.is_replied_reply_quoted_by_author", + 
Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) + val IS_REPLIED_REPLY_REPLIED_BY_AUTHOR = new Binary( + "itl.engagement.is_replied_reply_replied_by_author", + Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava) + val IS_REPLIED_REPLY_RETWEETED_BY_AUTHOR = new Binary( + "itl.engagement.is_replied_reply_retweeted_by_author", + Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) + val IS_REPLIED_REPLY_BLOCKED_BY_AUTHOR = new Binary( + "itl.engagement.is_replied_reply_blocked_by_author", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_REPLIED_REPLY_FOLLOWED_BY_AUTHOR = new Binary( + "itl.engagement.is_replied_reply_followed_by_author", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_REPLIED_REPLY_UNFOLLOWED_BY_AUTHOR = new Binary( + "itl.engagement.is_replied_reply_unfollowed_by_author", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_REPLIED_REPLY_MUTED_BY_AUTHOR = new Binary( + "itl.engagement.is_replied_reply_muted_by_author", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_REPLIED_REPLY_REPORTED_BY_AUTHOR = new Binary( + "itl.engagement.is_replied_reply_reported_by_author", + Set(EngagementsPrivate, EngagementsPublic).asJava) + + // This derived label is the logical OR of REPLY_REPLIED, REPLY_FAVORITED, REPLY_RETWEETED + val IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR = new Binary( + "itl.engagement.is_replied_reply_engaged_by_author", + Set(EngagementsPrivate, EngagementsPublic).asJava) + + // Reciprocal engagements for fav forward engagement + val IS_FAVORITED_FAV_FAVORITED_BY_AUTHOR = new Binary( + "itl.engagement.is_favorited_fav_favorited_by_author", + Set(EngagementsPrivate, EngagementsPublic, PrivateLikes, PublicLikes).asJava + ) + val IS_FAVORITED_FAV_REPLIED_BY_AUTHOR = new Binary( + "itl.engagement.is_favorited_fav_replied_by_author", + Set(EngagementsPrivate, EngagementsPublic, PrivateReplies, 
PublicReplies).asJava + ) + val IS_FAVORITED_FAV_RETWEETED_BY_AUTHOR = new Binary( + "itl.engagement.is_favorited_fav_retweeted_by_author", + Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava + ) + val IS_FAVORITED_FAV_FOLLOWED_BY_AUTHOR = new Binary( + "itl.engagement.is_favorited_fav_followed_by_author", + Set(EngagementsPrivate, EngagementsPublic).asJava + ) + // This derived label is the logical OR of FAV_REPLIED, FAV_FAVORITED, FAV_RETWEETED, FAV_FOLLOWED + val IS_FAVORITED_FAV_ENGAGED_BY_AUTHOR = new Binary( + "itl.engagement.is_favorited_fav_engaged_by_author", + Set(EngagementsPrivate, EngagementsPublic).asJava + ) + + // define good profile click by considering following engagements (follow, fav, reply, retweet, etc.) at profile page + val IS_PROFILE_CLICKED_AND_PROFILE_FOLLOW = new Binary( + "itl.engagement.is_profile_clicked_and_profile_follow", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, Follow).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_FAV = new Binary( + "itl.engagement.is_profile_clicked_and_profile_fav", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateLikes, PublicLikes).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_REPLY = new Binary( + "itl.engagement.is_profile_clicked_and_profile_reply", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateReplies, PublicReplies).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_RETWEET = new Binary( + "itl.engagement.is_profile_clicked_and_profile_retweet", + Set( + ProfilesViewed, + ProfilesClicked, + EngagementsPrivate, + PrivateRetweets, + PublicRetweets).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_CLICK = new Binary( + "itl.engagement.is_profile_clicked_and_profile_tweet_click", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, TweetsClicked).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_SHARE_DM_CLICK = new Binary( + "itl.engagement.is_profile_clicked_and_profile_share_dm_click", + Set(ProfilesViewed, 
ProfilesClicked, EngagementsPrivate).asJava) + // This derived label is the union of all binary features above + val IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED = new Binary( + "itl.engagement.is_profile_clicked_and_profile_engaged", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, EngagementsPublic).asJava) + + // define bad profile click by considering following engagements (user report, tweet report, mute, block, etc) at profile page + val IS_PROFILE_CLICKED_AND_PROFILE_USER_REPORT_CLICK = new Binary( + "itl.engagement.is_profile_clicked_and_profile_user_report_click", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_REPORT_CLICK = new Binary( + "itl.engagement.is_profile_clicked_and_profile_tweet_report_click", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_MUTE = new Binary( + "itl.engagement.is_profile_clicked_and_profile_mute", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_BLOCK = new Binary( + "itl.engagement.is_profile_clicked_and_profile_block", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + // This derived label is the union of bad profile click engagements and existing negative feedback + val IS_NEGATIVE_FEEDBACK_V2 = new Binary( + "itl.engagement.is_negative_feedback_v2", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + // engagement for following user from any surface area + val IS_FOLLOWED_FROM_ANY_SURFACE_AREA = new Binary( + "itl.engagement.is_followed_from_any_surface_area", + Set(EngagementsPublic, EngagementsPrivate).asJava) + + // Relevance prompt tweet engagements + val IS_RELEVANCE_PROMPT_YES_CLICKED = + new Binary("itl.engagement.is_relevance_prompt_yes_clicked", Set(EngagementsPrivate).asJava) + + // Reply downvote engagements + val IS_REPLY_DOWNVOTED = + new Binary("itl.engagement.is_reply_downvoted", 
Set(EngagementsPrivate).asJava) + val IS_REPLY_DOWNVOTE_REMOVED = + new Binary("itl.engagement.is_reply_downvote_removed", Set(EngagementsPrivate).asJava) + + // features from RecommendedTweet + val RECTWEET_SCORE = new Continuous("itl.recommended_tweet_features.rectweet_score") + val NUM_FAVORITING_USERS = new Continuous("itl.recommended_tweet_features.num_favoriting_users") + val NUM_FOLLOWING_USERS = new Continuous("itl.recommended_tweet_features.num_following_users") + val CONTENT_SOURCE_TYPE = new Discrete("itl.recommended_tweet_features.content_source_type") + + val RECOS_SCORE = new Continuous( + "itl.recommended_tweet_features.recos_score", + Set(EngagementScore, UsersRealGraphScore, UsersSalsaScore).asJava) + val AUTHOR_REALGRAPH_SCORE = new Continuous( + "itl.recommended_tweet_features.realgraph_score", + Set(UsersRealGraphScore).asJava) + val AUTHOR_SARUS_SCORE = new Continuous( + "itl.recommended_tweet_features.sarus_score", + Set(EngagementScore, UsersSalsaScore).asJava) + + val NUM_INTERACTING_USERS = new Continuous( + "itl.recommended_tweet_features.num_interacting_users", + Set(EngagementScore).asJava + ) + val MAX_REALGRAPH_SCORE_OF_INTERACTING_USERS = new Continuous( + "itl.recommended_tweet_features.max_realgraph_score_of_interacting_users", + Set(UsersRealGraphScore, EngagementScore).asJava + ) + val SUM_REALGRAPH_SCORE_OF_INTERACTING_USERS = new Continuous( + "itl.recommended_tweet_features.sum_realgraph_score_of_interacting_users", + Set(UsersRealGraphScore, EngagementScore).asJava + ) + val AVG_REALGRAPH_SCORE_OF_INTERACTING_USERS = new Continuous( + "itl.recommended_tweet_features.avg_realgraph_score_of_interacting_users", + Set(UsersRealGraphScore, EngagementScore).asJava + ) + val MAX_SARUS_SCORE_OF_INTERACTING_USERS = new Continuous( + "itl.recommended_tweet_features.max_sarus_score_of_interacting_users", + Set(EngagementScore, UsersSalsaScore).asJava + ) + val SUM_SARUS_SCORE_OF_INTERACTING_USERS = new Continuous( + 
"itl.recommended_tweet_features.sum_sarus_score_of_interacting_users", + Set(EngagementScore, UsersSalsaScore).asJava + ) + val AVG_SARUS_SCORE_OF_INTERACTING_USERS = new Continuous( + "itl.recommended_tweet_features.avg_sarus_score_of_interacting_users", + Set(EngagementScore, UsersSalsaScore).asJava + ) + + val NUM_INTERACTING_FOLLOWINGS = new Continuous( + "itl.recommended_tweet_features.num_interacting_followings", + Set(EngagementScore).asJava + ) + + // features from HydratedTweetFeatures + val REAL_GRAPH_WEIGHT = + new Continuous("itl.hydrated_tweet_features.real_graph_weight", Set(UsersRealGraphScore).asJava) + val SARUS_GRAPH_WEIGHT = new Continuous("itl.hydrated_tweet_features.sarus_graph_weight") + val FROM_TOP_ENGAGED_USER = new Binary("itl.hydrated_tweet_features.from_top_engaged_user") + val FROM_TOP_INFLUENCER = new Binary("itl.hydrated_tweet_features.from_top_influencer") + val TOPIC_SIM_SEARCHER_INTERSTED_IN_AUTHOR_KNOWN_FOR = new Continuous( + "itl.hydrated_tweet_features.topic_sim_searcher_interested_in_author_known_for" + ) + val TOPIC_SIM_SEARCHER_AUTHOR_BOTH_INTERESTED_IN = new Continuous( + "itl.hydrated_tweet_features.topic_sim_searcher_author_both_interested_in" + ) + val TOPIC_SIM_SEARCHER_AUTHOR_BOTH_KNOWN_FOR = new Continuous( + "itl.hydrated_tweet_features.topic_sim_searcher_author_both_known_for" + ) + val USER_REP = new Continuous("itl.hydrated_tweet_features.user_rep") + val NORMALIZED_PARUS_SCORE = new Continuous("itl.hydrated_tweet_features.normalized_parus_score") + val CONTAINS_MEDIA = new Binary("itl.hydrated_tweet_features.contains_media") + val FROM_NEARBY = new Binary("itl.hydrated_tweet_features.from_nearby") + val TOPIC_SIM_SEARCHER_INTERESTED_IN_TWEET = new Continuous( + "itl.hydrated_tweet_features.topic_sim_searcher_interested_in_tweet" + ) + val MATCHES_UI_LANG = new Binary( + "itl.hydrated_tweet_features.matches_ui_lang", + Set(ProvidedLanguage, InferredLanguage).asJava) + val MATCHES_SEARCHER_MAIN_LANG = new Binary( + 
"itl.hydrated_tweet_features.matches_searcher_main_lang", + Set(ProvidedLanguage, InferredLanguage).asJava + ) + val MATCHES_SEARCHER_LANGS = new Binary( + "itl.hydrated_tweet_features.matches_searcher_langs", + Set(ProvidedLanguage, InferredLanguage).asJava) + val HAS_CARD = new Binary( + "itl.hydrated_tweet_features.has_card", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_IMAGE = new Binary( + "itl.hydrated_tweet_features.has_image", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_NATIVE_IMAGE = new Binary( + "itl.hydrated_tweet_features.has_native_image", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_VIDEO = new Binary("itl.hydrated_tweet_features.has_video") + val HAS_CONSUMER_VIDEO = new Binary( + "itl.hydrated_tweet_features.has_consumer_video", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_PRO_VIDEO = new Binary( + "itl.hydrated_tweet_features.has_pro_video", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_PERISCOPE = new Binary( + "itl.hydrated_tweet_features.has_periscope", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_VINE = new Binary( + "itl.hydrated_tweet_features.has_vine", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_NATIVE_VIDEO = new Binary( + "itl.hydrated_tweet_features.has_native_video", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_LINK = new Binary( + "itl.hydrated_tweet_features.has_link", + Set(UrlFoundFlag, PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val LINK_COUNT = new Continuous( + "itl.hydrated_tweet_features.link_count", + Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava) + val URL_DOMAINS = new SparseBinary( + 
"itl.hydrated_tweet_features.url_domains", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_VISIBLE_LINK = new Binary( + "itl.hydrated_tweet_features.has_visible_link", + Set(UrlFoundFlag, PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_NEWS = new Binary( + "itl.hydrated_tweet_features.has_news", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_TREND = new Binary( + "itl.hydrated_tweet_features.has_trend", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val BLENDER_SCORE = + new Continuous("itl.hydrated_tweet_features.blender_score", Set(EngagementScore).asJava) + val PARUS_SCORE = + new Continuous("itl.hydrated_tweet_features.parus_score", Set(EngagementScore).asJava) + val TEXT_SCORE = + new Continuous("itl.hydrated_tweet_features.text_score", Set(EngagementScore).asJava) + val BIDIRECTIONAL_REPLY_COUNT = new Continuous( + "itl.hydrated_tweet_features.bidirectional_reply_count", + Set(CountOfPrivateReplies, CountOfPublicReplies).asJava + ) + val UNIDIRECTIONAL_REPLY_COUNT = new Continuous( + "itl.hydrated_tweet_features.unidirectional_reply_count", + Set(CountOfPrivateReplies, CountOfPublicReplies).asJava + ) + val BIDIRECTIONAL_RETWEET_COUNT = new Continuous( + "itl.hydrated_tweet_features.bidirectional_retweet_count", + Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava + ) + val UNIDIRECTIONAL_RETWEET_COUNT = new Continuous( + "itl.hydrated_tweet_features.unidirectional_retweet_count", + Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava + ) + val BIDIRECTIONAL_FAV_COUNT = new Continuous( + "itl.hydrated_tweet_features.bidirectional_fav_count", + Set(CountOfPrivateLikes, CountOfPublicLikes).asJava + ) + val UNIDIRECTIONAL_FAV_COUNT = new Continuous( + "itl.hydrated_tweet_features.unidirectional_fav_count", + Set(CountOfPrivateLikes, CountOfPublicLikes).asJava + ) + val CONVERSATION_COUNT = new 
Continuous("itl.hydrated_tweet_features.conversation_count") + val FAV_COUNT = new Continuous( + "itl.hydrated_tweet_features.fav_count", + Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) + val REPLY_COUNT = new Continuous( + "itl.hydrated_tweet_features.reply_count", + Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) + val RETWEET_COUNT = new Continuous( + "itl.hydrated_tweet_features.retweet_count", + Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) + val PREV_USER_TWEET_ENGAGEMENT = new Continuous( + "itl.hydrated_tweet_features.prev_user_tweet_enagagement", + Set(EngagementScore, EngagementsPrivate, EngagementsPublic).asJava + ) + val IS_SENSITIVE = new Binary("itl.hydrated_tweet_features.is_sensitive") + val HAS_MULTIPLE_MEDIA = new Binary( + "itl.hydrated_tweet_features.has_multiple_media", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_MULTIPLE_HASHTAGS_OR_TRENDS = new Binary( + "itl.hydrated_tweet_features.has_multiple_hashtag_or_trend", + Set( + UserVisibleFlag, + CountOfPrivateTweetEntitiesAndMetadata, + CountOfPublicTweetEntitiesAndMetadata).asJava) + val IS_AUTHOR_PROFILE_EGG = + new Binary("itl.hydrated_tweet_features.is_author_profile_egg", Set(ProfileImage).asJava) + val IS_AUTHOR_NEW = + new Binary("itl.hydrated_tweet_features.is_author_new", Set(UserType, UserState).asJava) + val NUM_MENTIONS = new Continuous( + "itl.hydrated_tweet_features.num_mentions", + Set( + UserVisibleFlag, + CountOfPrivateTweetEntitiesAndMetadata, + CountOfPublicTweetEntitiesAndMetadata).asJava) + val NUM_HASHTAGS = new Continuous( + "itl.hydrated_tweet_features.num_hashtags", + Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava) + val LANGUAGE = new Discrete( + "itl.hydrated_tweet_features.language", + Set(ProvidedLanguage, InferredLanguage).asJava) + val LINK_LANGUAGE = new Continuous( + "itl.hydrated_tweet_features.link_language", + Set(ProvidedLanguage, 
InferredLanguage).asJava) + val IS_AUTHOR_NSFW = + new Binary("itl.hydrated_tweet_features.is_author_nsfw", Set(UserType).asJava) + val IS_AUTHOR_SPAM = + new Binary("itl.hydrated_tweet_features.is_author_spam", Set(UserType).asJava) + val IS_AUTHOR_BOT = new Binary("itl.hydrated_tweet_features.is_author_bot", Set(UserType).asJava) + val IS_OFFENSIVE = new Binary("itl.hydrated_tweet_features.is_offensive") + val FROM_VERIFIED_ACCOUNT = + new Binary("itl.hydrated_tweet_features.from_verified_account", Set(UserVerifiedFlag).asJava) + val EMBEDS_IMPRESSION_COUNT = new Continuous( + "itl.hydrated_tweet_features.embeds_impression_count", + Set(CountOfImpression).asJava) + val EMBEDS_URL_COUNT = + new Continuous("itl.hydrated_tweet_features.embeds_url_count", Set(UrlFoundFlag).asJava) + val FAV_COUNT_V2 = new Continuous( + "recap.earlybird.fav_count_v2", + Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) + val RETWEET_COUNT_V2 = new Continuous( + "recap.earlybird.retweet_count_v2", + Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) + val REPLY_COUNT_V2 = new Continuous( + "recap.earlybird.reply_count_v2", + Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) +} diff --git a/src/scala/com/twitter/timelines/prediction/features/list_features/BUILD b/src/scala/com/twitter/timelines/prediction/features/list_features/BUILD new file mode 100644 index 0000000000..6fc497bf3d --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/list_features/BUILD @@ -0,0 +1,9 @@ +scala_library( + sources = ["*.scala"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "src/java/com/twitter/ml/api:api-base", + "src/thrift/com/twitter/dal/personal_data:personal_data-java", + ], +) diff --git a/src/scala/com/twitter/timelines/prediction/features/list_features/ListFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/list_features/ListFeatures.scala new file mode 100644 index 0000000000..ffb00d1f6c --- /dev/null +++ 
b/src/scala/com/twitter/timelines/prediction/features/list_features/ListFeatures.scala @@ -0,0 +1,24 @@ +package com.twitter.timelines.prediction.features.list_features + +import com.twitter.ml.api.Feature.{Binary, Discrete} +import com.twitter.ml.api.FeatureContext +import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ +import scala.collection.JavaConverters._ +/** ML feature definitions for lists: the list id, the `list.viewer.is_owner` / `list.viewer.is_subscriber` flags and the `list.is_pinned` flag, plus a FeatureContext bundling them. */ +object ListFeatures { + + // list.id is used for list tweet injections in home. timelines.meta.list_id is used for list tweets in list timeline. + val LIST_ID = new Discrete("list.id") + + val VIEWER_IS_OWNER = + new Binary("list.viewer.is_owner", Set(ListsNonpublicList, ListsPublicList).asJava) + val VIEWER_IS_SUBSCRIBER = new Binary("list.viewer.is_subscriber") + val IS_PINNED_LIST = new Binary("list.is_pinned") + /** FeatureContext built from exactly the four features declared in this object. */ + val featureContext = new FeatureContext( + LIST_ID, + VIEWER_IS_OWNER, + VIEWER_IS_SUBSCRIBER, + IS_PINNED_LIST + ) +} diff --git a/src/scala/com/twitter/timelines/prediction/features/p_home_latest/BUILD b/src/scala/com/twitter/timelines/prediction/features/p_home_latest/BUILD new file mode 100644 index 0000000000..6fc497bf3d --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/p_home_latest/BUILD @@ -0,0 +1,9 @@ +scala_library( + sources = ["*.scala"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "src/java/com/twitter/ml/api:api-base", + "src/thrift/com/twitter/dal/personal_data:personal_data-java", + ], +) diff --git a/src/scala/com/twitter/timelines/prediction/features/p_home_latest/HomeLatestUserFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/p_home_latest/HomeLatestUserFeatures.scala new file mode 100644 index 0000000000..65d721a05a --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/p_home_latest/HomeLatestUserFeatures.scala @@ -0,0 +1,49 @@ +package com.twitter.timelines.prediction.features.p_home_latest + +import com.twitter.ml.api.Feature.{Continuous, Discrete} +import 
com.twitter.dal.personal_data.thriftjava.PersonalDataType._ +import scala.collection.JavaConverters._ +/** Declares the single Discrete feature `home_latest.user_feature.last_login_timestamp_ms`, tagged with the PrivateTimestamp personal-data type. */ +object HomeLatestUserFeatures { + val LAST_LOGIN_TIMESTAMP_MS = + new Discrete("home_latest.user_feature.last_login_timestamp_ms", Set(PrivateTimestamp).asJava) +} +/** Feature definitions under the `home_latest.user_feature.*` namespace: an aggregate timestamp, two Continuous impression features (CountOfImpression) and three Discrete timestamp features (PrivateTimestamp). */ +object HomeLatestUserAggregatesFeatures { + + /** + * Used as `timestampFeature` in `OfflineAggregateSource` required by feature aggregations, set to + * the `dateRange` end timestamp by default + */ + val AGGREGATE_TIMESTAMP_MS = + new Discrete("home_latest.user_feature.aggregate_timestamp_ms", Set(PrivateTimestamp).asJava) + val HOME_TOP_IMPRESSIONS = + new Continuous("home_latest.user_feature.home_top_impressions", Set(CountOfImpression).asJava) + val HOME_LATEST_IMPRESSIONS = + new Continuous( + "home_latest.user_feature.home_latest_impressions", + Set(CountOfImpression).asJava) + val HOME_TOP_LAST_LOGIN_TIMESTAMP_MS = + new Discrete( + "home_latest.user_feature.home_top_last_login_timestamp_ms", + Set(PrivateTimestamp).asJava) + val HOME_LATEST_LAST_LOGIN_TIMESTAMP_MS = + new Discrete( + "home_latest.user_feature.home_latest_last_login_timestamp_ms", + Set(PrivateTimestamp).asJava) + val HOME_LATEST_MOST_RECENT_CLICK_TIMESTAMP_MS = + new Discrete( + "home_latest.user_feature.home_latest_most_recent_click_timestamp_ms", + Set(PrivateTimestamp).asJava) +} +/** Companion case class of the HomeLatestUserFeatures object: a user id paired with a last-login timestamp (milliseconds, per the field name). */ +case class HomeLatestUserFeatures(userId: Long, lastLoginTimestampMs: Long) +/** Companion case class of the HomeLatestUserAggregatesFeatures object: user id and aggregate timestamp are required, every other field is an Option. NOTE(review): fields presumably correspond one-to-one to the features declared in the companion object; confirm against the code that populates them. */ +case class HomeLatestUserAggregatesFeatures( + userId: Long, + aggregateTimestampMs: Long, + homeTopImpressions: Option[Double], + homeLatestImpressions: Option[Double], + homeTopLastLoginTimestampMs: Option[Long], + homeLatestLastLoginTimestampMs: Option[Long], + homeLatestMostRecentClickTimestampMs: Option[Long]) diff --git a/src/scala/com/twitter/timelines/prediction/features/ppmi/BUILD b/src/scala/com/twitter/timelines/prediction/features/ppmi/BUILD new file mode 100644 index 0000000000..babba31bba --- /dev/null +++ 
b/src/scala/com/twitter/timelines/prediction/features/ppmi/BUILD @@ -0,0 +1,8 @@ +scala_library( + sources = ["*.scala"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "src/java/com/twitter/ml/api:api-base", + ], +) diff --git a/src/scala/com/twitter/timelines/prediction/features/ppmi/PpmiFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/ppmi/PpmiFeatures.scala new file mode 100644 index 0000000000..7e6d1dea8c --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/ppmi/PpmiFeatures.scala @@ -0,0 +1,7 @@ +package com.twitter.timelines.prediction.features.ppmi + +import com.twitter.ml.api.Feature.Continuous +/** Declares the single Continuous feature `ppmi.source_author.score`; it is created without any personal-data-type annotations. */ +object PpmiDataRecordFeatures { + val PPMI_SCORE = new Continuous("ppmi.source_author.score") +} diff --git a/src/scala/com/twitter/timelines/prediction/features/real_graph/BUILD b/src/scala/com/twitter/timelines/prediction/features/real_graph/BUILD new file mode 100644 index 0000000000..868acec214 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/real_graph/BUILD @@ -0,0 +1,15 @@ +scala_library( + sources = ["*.scala"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "src/java/com/twitter/ml/api:api-base", + "src/scala/com/twitter/ml/featurestore/catalog/entities/core", + "src/scala/com/twitter/ml/featurestore/catalog/entities/timelines", + "src/scala/com/twitter/ml/featurestore/catalog/features/timelines:realgraph", + "src/scala/com/twitter/ml/featurestore/lib/entity", + "src/scala/com/twitter/ml/featurestore/lib/feature", + "src/thrift/com/twitter/dal/personal_data:personal_data-java", + "src/thrift/com/twitter/timelines/real_graph:real_graph-scala", + ], +) diff --git a/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatureStoreFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatureStoreFeatures.scala new file mode 100644 index 0000000000..7c52349aac --- 
/dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatureStoreFeatures.scala @@ -0,0 +1,232 @@ +package com.twitter.timelines.prediction.features.real_graph + +import com.twitter.ml.featurestore.catalog.entities.core.UserAuthor +import com.twitter.ml.featurestore.catalog.features.timelines.RealGraph +import com.twitter.ml.featurestore.lib.EdgeEntityId +import com.twitter.ml.featurestore.lib.UserId +import com.twitter.ml.featurestore.lib.feature.BoundFeatureSet +import com.twitter.ml.featurestore.lib.feature.Feature +import com.twitter.ml.featurestore.lib.feature.FeatureSet +/** Feature-store view of the RealGraph edge features: `boundUserAuthorfeatureSet` binds the listed RealGraph features to the UserAuthor entity; the other members group features by value type (Double / Long) and map them to legacy feature names. */ +object RealGraphDataRecordFeatureStoreFeatures { + val boundUserAuthorfeatureSet: BoundFeatureSet = FeatureSet( + RealGraph.DestId, + RealGraph.AddressBookEmail.DaysSinceLast, + RealGraph.AddressBookEmail.ElapsedDays, + RealGraph.AddressBookEmail.Ewma, + RealGraph.AddressBookEmail.IsMissing, + RealGraph.AddressBookEmail.Mean, + RealGraph.AddressBookEmail.NonZeroDays, + RealGraph.AddressBookEmail.Variance, + RealGraph.AddressBookInBoth.DaysSinceLast, + RealGraph.AddressBookInBoth.ElapsedDays, + RealGraph.AddressBookInBoth.Ewma, + RealGraph.AddressBookInBoth.IsMissing, + RealGraph.AddressBookInBoth.Mean, + RealGraph.AddressBookInBoth.NonZeroDays, + RealGraph.AddressBookInBoth.Variance, + RealGraph.AddressBookMutualEdgeEmail.DaysSinceLast, + RealGraph.AddressBookMutualEdgeEmail.ElapsedDays, + RealGraph.AddressBookMutualEdgeEmail.Ewma, + RealGraph.AddressBookMutualEdgeEmail.IsMissing, + RealGraph.AddressBookMutualEdgeEmail.Mean, + RealGraph.AddressBookMutualEdgeEmail.NonZeroDays, + RealGraph.AddressBookMutualEdgeEmail.Variance, + RealGraph.AddressBookMutualEdgeInBoth.DaysSinceLast, + RealGraph.AddressBookMutualEdgeInBoth.ElapsedDays, + RealGraph.AddressBookMutualEdgeInBoth.Ewma, + RealGraph.AddressBookMutualEdgeInBoth.IsMissing, + RealGraph.AddressBookMutualEdgeInBoth.Mean, + RealGraph.AddressBookMutualEdgeInBoth.NonZeroDays, + 
RealGraph.AddressBookMutualEdgeInBoth.Variance, + RealGraph.AddressBookMutualEdgePhone.DaysSinceLast, + RealGraph.AddressBookMutualEdgePhone.ElapsedDays, + RealGraph.AddressBookMutualEdgePhone.Ewma, + RealGraph.AddressBookMutualEdgePhone.IsMissing, + RealGraph.AddressBookMutualEdgePhone.Mean, + RealGraph.AddressBookMutualEdgePhone.NonZeroDays, + RealGraph.AddressBookMutualEdgePhone.Variance, + RealGraph.AddressBookPhone.DaysSinceLast, + RealGraph.AddressBookPhone.ElapsedDays, + RealGraph.AddressBookPhone.Ewma, + RealGraph.AddressBookPhone.IsMissing, + RealGraph.AddressBookPhone.Mean, + RealGraph.AddressBookPhone.NonZeroDays, + RealGraph.AddressBookPhone.Variance, + RealGraph.DirectMessages.DaysSinceLast, + RealGraph.DirectMessages.ElapsedDays, + RealGraph.DirectMessages.Ewma, + RealGraph.DirectMessages.IsMissing, + RealGraph.DirectMessages.Mean, + RealGraph.DirectMessages.NonZeroDays, + RealGraph.DirectMessages.Variance, + RealGraph.DwellTime.DaysSinceLast, + RealGraph.DwellTime.ElapsedDays, + RealGraph.DwellTime.Ewma, + RealGraph.DwellTime.IsMissing, + RealGraph.DwellTime.Mean, + RealGraph.DwellTime.NonZeroDays, + RealGraph.DwellTime.Variance, + RealGraph.Follow.DaysSinceLast, + RealGraph.Follow.ElapsedDays, + RealGraph.Follow.Ewma, + RealGraph.Follow.IsMissing, + RealGraph.Follow.Mean, + RealGraph.Follow.NonZeroDays, + RealGraph.Follow.Variance, + RealGraph.InspectedStatuses.DaysSinceLast, + RealGraph.InspectedStatuses.ElapsedDays, + RealGraph.InspectedStatuses.Ewma, + RealGraph.InspectedStatuses.IsMissing, + RealGraph.InspectedStatuses.Mean, + RealGraph.InspectedStatuses.NonZeroDays, + RealGraph.InspectedStatuses.Variance, + RealGraph.Likes.DaysSinceLast, + RealGraph.Likes.ElapsedDays, + RealGraph.Likes.Ewma, + RealGraph.Likes.IsMissing, + RealGraph.Likes.Mean, + RealGraph.Likes.NonZeroDays, + RealGraph.Likes.Variance, + RealGraph.LinkClicks.DaysSinceLast, + RealGraph.LinkClicks.ElapsedDays, + RealGraph.LinkClicks.Ewma, + RealGraph.LinkClicks.IsMissing, + 
RealGraph.LinkClicks.Mean, + RealGraph.LinkClicks.NonZeroDays, + RealGraph.LinkClicks.Variance, + RealGraph.Mentions.DaysSinceLast, + RealGraph.Mentions.ElapsedDays, + RealGraph.Mentions.Ewma, + RealGraph.Mentions.IsMissing, + RealGraph.Mentions.Mean, + RealGraph.Mentions.NonZeroDays, + RealGraph.Mentions.Variance, + RealGraph.MutualFollow.DaysSinceLast, + RealGraph.MutualFollow.ElapsedDays, + RealGraph.MutualFollow.Ewma, + RealGraph.MutualFollow.IsMissing, + RealGraph.MutualFollow.Mean, + RealGraph.MutualFollow.NonZeroDays, + RealGraph.MutualFollow.Variance, + RealGraph.NumTweetQuotes.DaysSinceLast, + RealGraph.NumTweetQuotes.ElapsedDays, + RealGraph.NumTweetQuotes.Ewma, + RealGraph.NumTweetQuotes.IsMissing, + RealGraph.NumTweetQuotes.Mean, + RealGraph.NumTweetQuotes.NonZeroDays, + RealGraph.NumTweetQuotes.Variance, + RealGraph.PhotoTags.DaysSinceLast, + RealGraph.PhotoTags.ElapsedDays, + RealGraph.PhotoTags.Ewma, + RealGraph.PhotoTags.IsMissing, + RealGraph.PhotoTags.Mean, + RealGraph.PhotoTags.NonZeroDays, + RealGraph.PhotoTags.Variance, + RealGraph.ProfileViews.DaysSinceLast, + RealGraph.ProfileViews.ElapsedDays, + RealGraph.ProfileViews.Ewma, + RealGraph.ProfileViews.IsMissing, + RealGraph.ProfileViews.Mean, + RealGraph.ProfileViews.NonZeroDays, + RealGraph.ProfileViews.Variance, + RealGraph.Retweets.DaysSinceLast, + RealGraph.Retweets.ElapsedDays, + RealGraph.Retweets.Ewma, + RealGraph.Retweets.IsMissing, + RealGraph.Retweets.Mean, + RealGraph.Retweets.NonZeroDays, + RealGraph.Retweets.Variance, + RealGraph.SmsFollow.DaysSinceLast, + RealGraph.SmsFollow.ElapsedDays, + RealGraph.SmsFollow.Ewma, + RealGraph.SmsFollow.IsMissing, + RealGraph.SmsFollow.Mean, + RealGraph.SmsFollow.NonZeroDays, + RealGraph.SmsFollow.Variance, + RealGraph.TweetClicks.DaysSinceLast, + RealGraph.TweetClicks.ElapsedDays, + RealGraph.TweetClicks.Ewma, + RealGraph.TweetClicks.IsMissing, + RealGraph.TweetClicks.Mean, + RealGraph.TweetClicks.NonZeroDays, + RealGraph.TweetClicks.Variance, + 
RealGraph.Weight + ).bind(UserAuthor) + /** Edge features expanded by htlDoubleFeatures and htlLongFeatures. NOTE(review): RealGraph.NumTweetQuotes appears in boundUserAuthorfeatureSet but not in this list or in edgeFeatureToLegacyName; confirm the omission is intentional. */ + private[this] val edgeFeatures: Seq[RealGraph.EdgeFeature] = Seq( + RealGraph.AddressBookEmail, + RealGraph.AddressBookInBoth, + RealGraph.AddressBookMutualEdgeEmail, + RealGraph.AddressBookMutualEdgeInBoth, + RealGraph.AddressBookMutualEdgePhone, + RealGraph.AddressBookPhone, + RealGraph.DirectMessages, + RealGraph.DwellTime, + RealGraph.Follow, + RealGraph.InspectedStatuses, + RealGraph.Likes, + RealGraph.LinkClicks, + RealGraph.Mentions, + RealGraph.MutualFollow, + RealGraph.PhotoTags, + RealGraph.ProfileViews, + RealGraph.Retweets, + RealGraph.SmsFollow, + RealGraph.TweetClicks + ) + /** Double-valued features: Ewma, Mean and Variance of each entry in edgeFeatures, plus RealGraph.Weight. */ + val htlDoubleFeatures: Set[Feature[EdgeEntityId[UserId, UserId], Double]] = { + val features = edgeFeatures.flatMap { ef => + Seq(ef.Ewma, ef.Mean, ef.Variance) + } ++ Seq(RealGraph.Weight) + features.toSet + } + /** Long-valued features: DaysSinceLast, ElapsedDays and NonZeroDays of each entry in edgeFeatures. */ + val htlLongFeatures: Set[Feature[EdgeEntityId[UserId, UserId], Long]] = { + val features = edgeFeatures.flatMap { ef => + Seq(ef.DaysSinceLast, ef.ElapsedDays, ef.NonZeroDays) + } + features.toSet + } + /** Name segment used for each edge feature when convertFeatureToLegacyName builds legacy feature names. */ + private val edgeFeatureToLegacyName = Map( + RealGraph.AddressBookEmail -> "num_address_book_email", + RealGraph.AddressBookInBoth -> "num_address_book_in_both", + RealGraph.AddressBookMutualEdgeEmail -> "num_address_book_mutual_edge_email", + RealGraph.AddressBookMutualEdgeInBoth -> "num_address_book_mutual_edge_in_both", + RealGraph.AddressBookMutualEdgePhone -> "num_address_book_mutual_edge_phone", + RealGraph.AddressBookPhone -> "num_address_book_phone", + RealGraph.DirectMessages -> "direct_messages", + RealGraph.DwellTime -> "total_dwell_time", + RealGraph.Follow -> "num_follow", + RealGraph.InspectedStatuses -> "num_inspected_tweets", + RealGraph.Likes -> "num_favorites", + RealGraph.LinkClicks -> "num_link_clicks", + RealGraph.Mentions -> "num_mentions", + RealGraph.MutualFollow -> "num_mutual_follow", + RealGraph.PhotoTags -> "num_photo_tags", + RealGraph.ProfileViews -> "num_profile_views", + RealGraph.Retweets -> 
"num_retweets", + RealGraph.SmsFollow -> "num_sms_follow", + RealGraph.TweetClicks -> "num_tweet_clicks", + ) + /** Builds a feature -> legacy-name map: for each entry of edgeFeatureToLegacyName, its NonZeroDays, DaysSinceLast, ElapsedDays, Ewma, Mean and Variance features map to "<prefix>.<legacy name>.<stat>", and RealGraph.Weight maps to "<prefix>.weight". IsMissing features are not included. @param prefix leading segment of every generated name. @param variance segment used for the Variance stat (defaults to "variance"). */ + def convertFeatureToLegacyName( + prefix: String, + variance: String = "variance" + ): Map[Feature[EdgeEntityId[UserId, UserId], _ >: Long with Double <: AnyVal], String] = + edgeFeatureToLegacyName.flatMap { + case (k, v) => + Seq( + k.NonZeroDays -> s"${prefix}.${v}.non_zero_days", + k.DaysSinceLast -> s"${prefix}.${v}.days_since_last", + k.ElapsedDays -> s"${prefix}.${v}.elapsed_days", + k.Ewma -> s"${prefix}.${v}.ewma", + k.Mean -> s"${prefix}.${v}.mean", + k.Variance -> s"${prefix}.${v}.${variance}", + ) + } ++ Map( + RealGraph.Weight -> (prefix + ".weight") + ) +} diff --git a/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatures.scala new file mode 100644 index 0000000000..4c19159448 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatures.scala @@ -0,0 +1,534 @@ +package com.twitter.timelines.prediction.features.real_graph + +import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ +import com.twitter.ml.api.Feature._ +import com.twitter.timelines.real_graph.v1.thriftscala.RealGraphEdgeFeature +import scala.collection.JavaConverters._ + + +object RealGraphDataRecordFeatures { + // the source user id + val SRC_ID = new Discrete("realgraph.src_id", Set(UserId).asJava) + // the destination user id + val DST_ID = new Discrete("realgraph.dst_id", Set(UserId).asJava) + // real graph weight + val WEIGHT = new Continuous("realgraph.weight", Set(UsersRealGraphScore).asJava) + // the number of retweets that the source user sent to the destination user + val NUM_RETWEETS_MEAN = + new Continuous("realgraph.num_retweets.mean", Set(PrivateRetweets, PublicRetweets).asJava) + val NUM_RETWEETS_EWMA = + new Continuous("realgraph.num_retweets.ewma", 
Set(PrivateRetweets, PublicRetweets).asJava) + val NUM_RETWEETS_VARIANCE = + new Continuous("realgraph.num_retweets.variance", Set(PrivateRetweets, PublicRetweets).asJava) + val NUM_RETWEETS_NON_ZERO_DAYS = new Continuous( + "realgraph.num_retweets.non_zero_days", + Set(PrivateRetweets, PublicRetweets).asJava) + val NUM_RETWEETS_ELAPSED_DAYS = new Continuous( + "realgraph.num_retweets.elapsed_days", + Set(PrivateRetweets, PublicRetweets).asJava) + val NUM_RETWEETS_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_retweets.days_since_last", + Set(PrivateRetweets, PublicRetweets).asJava) + val NUM_RETWEETS_IS_MISSING = + new Binary("realgraph.num_retweets.is_missing", Set(PrivateRetweets, PublicRetweets).asJava) + // the number of favorites that the source user sent to the destination user + val NUM_FAVORITES_MEAN = + new Continuous("realgraph.num_favorites.mean", Set(PublicLikes, PrivateLikes).asJava) + val NUM_FAVORITES_EWMA = + new Continuous("realgraph.num_favorites.ewma", Set(PublicLikes, PrivateLikes).asJava) + val NUM_FAVORITES_VARIANCE = + new Continuous("realgraph.num_favorites.variance", Set(PublicLikes, PrivateLikes).asJava) + val NUM_FAVORITES_NON_ZERO_DAYS = + new Continuous("realgraph.num_favorites.non_zero_days", Set(PublicLikes, PrivateLikes).asJava) + val NUM_FAVORITES_ELAPSED_DAYS = + new Continuous("realgraph.num_favorites.elapsed_days", Set(PublicLikes, PrivateLikes).asJava) + val NUM_FAVORITES_DAYS_SINCE_LAST = + new Continuous("realgraph.num_favorites.days_since_last", Set(PublicLikes, PrivateLikes).asJava) + val NUM_FAVORITES_IS_MISSING = + new Binary("realgraph.num_favorites.is_missing", Set(PublicLikes, PrivateLikes).asJava) + // the number of mentions that the source user sent to the destination user + val NUM_MENTIONS_MEAN = + new Continuous("realgraph.num_mentions.mean", Set(EngagementsPrivate, EngagementsPublic).asJava) + val NUM_MENTIONS_EWMA = + new Continuous("realgraph.num_mentions.ewma", Set(EngagementsPrivate, 
EngagementsPublic).asJava) + val NUM_MENTIONS_VARIANCE = new Continuous( + "realgraph.num_mentions.variance", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val NUM_MENTIONS_NON_ZERO_DAYS = new Continuous( + "realgraph.num_mentions.non_zero_days", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val NUM_MENTIONS_ELAPSED_DAYS = new Continuous( + "realgraph.num_mentions.elapsed_days", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val NUM_MENTIONS_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_mentions.days_since_last", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val NUM_MENTIONS_IS_MISSING = new Binary( + "realgraph.num_mentions.is_missing", + Set(EngagementsPrivate, EngagementsPublic).asJava) + // the number of direct messages that the source user sent to the destination user + val NUM_DIRECT_MESSAGES_MEAN = new Continuous( + "realgraph.num_direct_messages.mean", + Set(DmEntitiesAndMetadata, CountOfDms).asJava) + val NUM_DIRECT_MESSAGES_EWMA = new Continuous( + "realgraph.num_direct_messages.ewma", + Set(DmEntitiesAndMetadata, CountOfDms).asJava) + val NUM_DIRECT_MESSAGES_VARIANCE = new Continuous( + "realgraph.num_direct_messages.variance", + Set(DmEntitiesAndMetadata, CountOfDms).asJava) + val NUM_DIRECT_MESSAGES_NON_ZERO_DAYS = new Continuous( + "realgraph.num_direct_messages.non_zero_days", + Set(DmEntitiesAndMetadata, CountOfDms).asJava + ) + val NUM_DIRECT_MESSAGES_ELAPSED_DAYS = new Continuous( + "realgraph.num_direct_messages.elapsed_days", + Set(DmEntitiesAndMetadata, CountOfDms).asJava + ) + val NUM_DIRECT_MESSAGES_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_direct_messages.days_since_last", + Set(DmEntitiesAndMetadata, CountOfDms).asJava + ) + val NUM_DIRECT_MESSAGES_IS_MISSING = new Binary( + "realgraph.num_direct_messages.is_missing", + Set(DmEntitiesAndMetadata, CountOfDms).asJava) + // the number of tweet clicks that the source user sent to the destination user + val NUM_TWEET_CLICKS_MEAN = + new 
Continuous("realgraph.num_tweet_clicks.mean", Set(TweetsClicked).asJava) + val NUM_TWEET_CLICKS_EWMA = + new Continuous("realgraph.num_tweet_clicks.ewma", Set(TweetsClicked).asJava) + val NUM_TWEET_CLICKS_VARIANCE = + new Continuous("realgraph.num_tweet_clicks.variance", Set(TweetsClicked).asJava) + val NUM_TWEET_CLICKS_NON_ZERO_DAYS = + new Continuous("realgraph.num_tweet_clicks.non_zero_days", Set(TweetsClicked).asJava) + val NUM_TWEET_CLICKS_ELAPSED_DAYS = + new Continuous("realgraph.num_tweet_clicks.elapsed_days", Set(TweetsClicked).asJava) + val NUM_TWEET_CLICKS_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_tweet_clicks.days_since_last", + Set(TweetsClicked).asJava + ) + val NUM_TWEET_CLICKS_IS_MISSING = + new Binary("realgraph.num_tweet_clicks.is_missing", Set(TweetsClicked).asJava) + // the number of link clicks that the source user sent to the destination user + val NUM_LINK_CLICKS_MEAN = + new Continuous("realgraph.num_link_clicks.mean", Set(CountOfTweetEntitiesClicked).asJava) + val NUM_LINK_CLICKS_EWMA = + new Continuous("realgraph.num_link_clicks.ewma", Set(CountOfTweetEntitiesClicked).asJava) + val NUM_LINK_CLICKS_VARIANCE = + new Continuous("realgraph.num_link_clicks.variance", Set(CountOfTweetEntitiesClicked).asJava) + val NUM_LINK_CLICKS_NON_ZERO_DAYS = new Continuous( + "realgraph.num_link_clicks.non_zero_days", + Set(CountOfTweetEntitiesClicked).asJava) + val NUM_LINK_CLICKS_ELAPSED_DAYS = new Continuous( + "realgraph.num_link_clicks.elapsed_days", + Set(CountOfTweetEntitiesClicked).asJava) + val NUM_LINK_CLICKS_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_link_clicks.days_since_last", + Set(CountOfTweetEntitiesClicked).asJava) + val NUM_LINK_CLICKS_IS_MISSING = + new Binary("realgraph.num_link_clicks.is_missing", Set(CountOfTweetEntitiesClicked).asJava) + // the number of profile views that the source user sent to the destination user + val NUM_PROFILE_VIEWS_MEAN = + new Continuous("realgraph.num_profile_views.mean", 
Set(ProfilesViewed).asJava) + val NUM_PROFILE_VIEWS_EWMA = + new Continuous("realgraph.num_profile_views.ewma", Set(ProfilesViewed).asJava) + val NUM_PROFILE_VIEWS_VARIANCE = + new Continuous("realgraph.num_profile_views.variance", Set(ProfilesViewed).asJava) + val NUM_PROFILE_VIEWS_NON_ZERO_DAYS = + new Continuous("realgraph.num_profile_views.non_zero_days", Set(ProfilesViewed).asJava) + val NUM_PROFILE_VIEWS_ELAPSED_DAYS = + new Continuous("realgraph.num_profile_views.elapsed_days", Set(ProfilesViewed).asJava) + val NUM_PROFILE_VIEWS_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_profile_views.days_since_last", + Set(ProfilesViewed).asJava + ) + val NUM_PROFILE_VIEWS_IS_MISSING = + new Binary("realgraph.num_profile_views.is_missing", Set(ProfilesViewed).asJava) + // the total dwell time the source user spends on the target user's tweets + val TOTAL_DWELL_TIME_MEAN = + new Continuous("realgraph.total_dwell_time.mean", Set(CountOfImpression).asJava) + val TOTAL_DWELL_TIME_EWMA = + new Continuous("realgraph.total_dwell_time.ewma", Set(CountOfImpression).asJava) + val TOTAL_DWELL_TIME_VARIANCE = + new Continuous("realgraph.total_dwell_time.variance", Set(CountOfImpression).asJava) + val TOTAL_DWELL_TIME_NON_ZERO_DAYS = + new Continuous("realgraph.total_dwell_time.non_zero_days", Set(CountOfImpression).asJava) + val TOTAL_DWELL_TIME_ELAPSED_DAYS = + new Continuous("realgraph.total_dwell_time.elapsed_days", Set(CountOfImpression).asJava) + val TOTAL_DWELL_TIME_DAYS_SINCE_LAST = new Continuous( + "realgraph.total_dwell_time.days_since_last", + Set(CountOfImpression).asJava + ) + val TOTAL_DWELL_TIME_IS_MISSING = + new Binary("realgraph.total_dwell_time.is_missing", Set(CountOfImpression).asJava) + // the number of the target user's tweets that the source user has inspected + val NUM_INSPECTED_TWEETS_MEAN = + new Continuous("realgraph.num_inspected_tweets.mean", Set(CountOfImpression).asJava) + val NUM_INSPECTED_TWEETS_EWMA = + new 
Continuous("realgraph.num_inspected_tweets.ewma", Set(CountOfImpression).asJava) + val NUM_INSPECTED_TWEETS_VARIANCE = + new Continuous("realgraph.num_inspected_tweets.variance", Set(CountOfImpression).asJava) + val NUM_INSPECTED_TWEETS_NON_ZERO_DAYS = new Continuous( + "realgraph.num_inspected_tweets.non_zero_days", + Set(CountOfImpression).asJava + ) + val NUM_INSPECTED_TWEETS_ELAPSED_DAYS = new Continuous( + "realgraph.num_inspected_tweets.elapsed_days", + Set(CountOfImpression).asJava + ) + val NUM_INSPECTED_TWEETS_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_inspected_tweets.days_since_last", + Set(CountOfImpression).asJava + ) + val NUM_INSPECTED_TWEETS_IS_MISSING = + new Binary("realgraph.num_inspected_tweets.is_missing", Set(CountOfImpression).asJava) + // the number of photos in which the source user has tagged the target user + val NUM_PHOTO_TAGS_MEAN = new Continuous( + "realgraph.num_photo_tags.mean", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val NUM_PHOTO_TAGS_EWMA = new Continuous( + "realgraph.num_photo_tags.ewma", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val NUM_PHOTO_TAGS_VARIANCE = new Continuous( + "realgraph.num_photo_tags.variance", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val NUM_PHOTO_TAGS_NON_ZERO_DAYS = new Continuous( + "realgraph.num_photo_tags.non_zero_days", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val NUM_PHOTO_TAGS_ELAPSED_DAYS = new Continuous( + "realgraph.num_photo_tags.elapsed_days", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val NUM_PHOTO_TAGS_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_photo_tags.days_since_last", + Set(EngagementsPrivate, EngagementsPublic).asJava) + val NUM_PHOTO_TAGS_IS_MISSING = new Binary( + "realgraph.num_photo_tags.is_missing", + Set(EngagementsPrivate, EngagementsPublic).asJava) + + val NUM_FOLLOW_MEAN = new Continuous( + "realgraph.num_follow.mean", + Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) + 
val NUM_FOLLOW_EWMA = new Continuous( + "realgraph.num_follow.ewma", + Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) + val NUM_FOLLOW_VARIANCE = new Continuous( + "realgraph.num_follow.variance", + Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) + val NUM_FOLLOW_NON_ZERO_DAYS = new Continuous( + "realgraph.num_follow.non_zero_days", + Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) + val NUM_FOLLOW_ELAPSED_DAYS = new Continuous( + "realgraph.num_follow.elapsed_days", + Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) + val NUM_FOLLOW_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_follow.days_since_last", + Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) + val NUM_FOLLOW_IS_MISSING = new Binary( + "realgraph.num_follow.is_missing", + Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) + // the number of blocks that the source user sent to the destination user + val NUM_BLOCKS_MEAN = + new Continuous("realgraph.num_blocks.mean", Set(CountOfBlocks).asJava) + val NUM_BLOCKS_EWMA = + new Continuous("realgraph.num_blocks.ewma", Set(CountOfBlocks).asJava) + val NUM_BLOCKS_VARIANCE = + new Continuous("realgraph.num_blocks.variance", Set(CountOfBlocks).asJava) + val NUM_BLOCKS_NON_ZERO_DAYS = + new Continuous("realgraph.num_blocks.non_zero_days", Set(CountOfBlocks).asJava) + val NUM_BLOCKS_ELAPSED_DAYS = + new Continuous("realgraph.num_blocks.elapsed_days", Set(CountOfBlocks).asJava) + val NUM_BLOCKS_DAYS_SINCE_LAST = + new Continuous("realgraph.num_blocks.days_since_last", Set(CountOfBlocks).asJava) + val NUM_BLOCKS_IS_MISSING = + new Binary("realgraph.num_blocks.is_missing", Set(CountOfBlocks).asJava) + // the number of mutes that the source user sent to the destination user + val NUM_MUTES_MEAN = + new Continuous("realgraph.num_mutes.mean", Set(CountOfMutes).asJava) + val NUM_MUTES_EWMA = + new 
Continuous("realgraph.num_mutes.ewma", Set(CountOfMutes).asJava) + val NUM_MUTES_VARIANCE = + new Continuous("realgraph.num_mutes.variance", Set(CountOfMutes).asJava) + val NUM_MUTES_NON_ZERO_DAYS = + new Continuous("realgraph.num_mutes.non_zero_days", Set(CountOfMutes).asJava) + val NUM_MUTES_ELAPSED_DAYS = + new Continuous("realgraph.num_mutes.elapsed_days", Set(CountOfMutes).asJava) + val NUM_MUTES_DAYS_SINCE_LAST = + new Continuous("realgraph.num_mutes.days_since_last", Set(CountOfMutes).asJava) + val NUM_MUTES_IS_MISSING = + new Binary("realgraph.num_mutes.is_missing", Set(CountOfMutes).asJava) + // the number of report as abuses that the source user sent to the destination user + val NUM_REPORTS_AS_ABUSES_MEAN = + new Continuous("realgraph.num_report_as_abuses.mean", Set(CountOfAbuseReports).asJava) + val NUM_REPORTS_AS_ABUSES_EWMA = + new Continuous("realgraph.num_report_as_abuses.ewma", Set(CountOfAbuseReports).asJava) + val NUM_REPORTS_AS_ABUSES_VARIANCE = + new Continuous("realgraph.num_report_as_abuses.variance", Set(CountOfAbuseReports).asJava) + val NUM_REPORTS_AS_ABUSES_NON_ZERO_DAYS = + new Continuous("realgraph.num_report_as_abuses.non_zero_days", Set(CountOfAbuseReports).asJava) + val NUM_REPORTS_AS_ABUSES_ELAPSED_DAYS = + new Continuous("realgraph.num_report_as_abuses.elapsed_days", Set(CountOfAbuseReports).asJava) + val NUM_REPORTS_AS_ABUSES_DAYS_SINCE_LAST = + new Continuous( + "realgraph.num_report_as_abuses.days_since_last", + Set(CountOfAbuseReports).asJava) + val NUM_REPORTS_AS_ABUSES_IS_MISSING = + new Binary("realgraph.num_report_as_abuses.is_missing", Set(CountOfAbuseReports).asJava) + // the number of report as spams that the source user sent to the destination user + val NUM_REPORTS_AS_SPAMS_MEAN = + new Continuous( + "realgraph.num_report_as_spams.mean", + Set(CountOfAbuseReports, SafetyRelationships).asJava) + val NUM_REPORTS_AS_SPAMS_EWMA = + new Continuous( + "realgraph.num_report_as_spams.ewma", + Set(CountOfAbuseReports, 
SafetyRelationships).asJava) + val NUM_REPORTS_AS_SPAMS_VARIANCE = + new Continuous( + "realgraph.num_report_as_spams.variance", + Set(CountOfAbuseReports, SafetyRelationships).asJava) + val NUM_REPORTS_AS_SPAMS_NON_ZERO_DAYS = + new Continuous( + "realgraph.num_report_as_spams.non_zero_days", + Set(CountOfAbuseReports, SafetyRelationships).asJava) + val NUM_REPORTS_AS_SPAMS_ELAPSED_DAYS = + new Continuous( + "realgraph.num_report_as_spams.elapsed_days", + Set(CountOfAbuseReports, SafetyRelationships).asJava) + val NUM_REPORTS_AS_SPAMS_DAYS_SINCE_LAST = + new Continuous( + "realgraph.num_report_as_spams.days_since_last", + Set(CountOfAbuseReports, SafetyRelationships).asJava) + val NUM_REPORTS_AS_SPAMS_IS_MISSING = + new Binary( + "realgraph.num_report_as_spams.is_missing", + Set(CountOfAbuseReports, SafetyRelationships).asJava) + + val NUM_MUTUAL_FOLLOW_MEAN = new Continuous( + "realgraph.num_mutual_follow.mean", + Set( + Follow, + PrivateAccountsFollowedBy, + PublicAccountsFollowedBy, + PrivateAccountsFollowing, + PublicAccountsFollowing).asJava + ) + val NUM_MUTUAL_FOLLOW_EWMA = new Continuous( + "realgraph.num_mutual_follow.ewma", + Set( + Follow, + PrivateAccountsFollowedBy, + PublicAccountsFollowedBy, + PrivateAccountsFollowing, + PublicAccountsFollowing).asJava + ) + val NUM_MUTUAL_FOLLOW_VARIANCE = new Continuous( + "realgraph.num_mutual_follow.variance", + Set( + Follow, + PrivateAccountsFollowedBy, + PublicAccountsFollowedBy, + PrivateAccountsFollowing, + PublicAccountsFollowing).asJava + ) + val NUM_MUTUAL_FOLLOW_NON_ZERO_DAYS = new Continuous( + "realgraph.num_mutual_follow.non_zero_days", + Set( + Follow, + PrivateAccountsFollowedBy, + PublicAccountsFollowedBy, + PrivateAccountsFollowing, + PublicAccountsFollowing).asJava + ) + val NUM_MUTUAL_FOLLOW_ELAPSED_DAYS = new Continuous( + "realgraph.num_mutual_follow.elapsed_days", + Set( + Follow, + PrivateAccountsFollowedBy, + PublicAccountsFollowedBy, + PrivateAccountsFollowing, + 
PublicAccountsFollowing).asJava + ) + val NUM_MUTUAL_FOLLOW_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_mutual_follow.days_since_last", + Set( + Follow, + PrivateAccountsFollowedBy, + PublicAccountsFollowedBy, + PrivateAccountsFollowing, + PublicAccountsFollowing).asJava + ) + val NUM_MUTUAL_FOLLOW_IS_MISSING = new Binary( + "realgraph.num_mutual_follow.is_missing", + Set( + Follow, + PrivateAccountsFollowedBy, + PublicAccountsFollowedBy, + PrivateAccountsFollowing, + PublicAccountsFollowing).asJava + ) + + val NUM_SMS_FOLLOW_MEAN = new Continuous( + "realgraph.num_sms_follow.mean", + Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) + val NUM_SMS_FOLLOW_EWMA = new Continuous( + "realgraph.num_sms_follow.ewma", + Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) + val NUM_SMS_FOLLOW_VARIANCE = new Continuous( + "realgraph.num_sms_follow.variance", + Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) + val NUM_SMS_FOLLOW_NON_ZERO_DAYS = new Continuous( + "realgraph.num_sms_follow.non_zero_days", + Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) + val NUM_SMS_FOLLOW_ELAPSED_DAYS = new Continuous( + "realgraph.num_sms_follow.elapsed_days", + Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) + val NUM_SMS_FOLLOW_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_sms_follow.days_since_last", + Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) + val NUM_SMS_FOLLOW_IS_MISSING = new Binary( + "realgraph.num_sms_follow.is_missing", + Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) + + val NUM_ADDRESS_BOOK_EMAIL_MEAN = + new Continuous("realgraph.num_address_book_email.mean", Set(AddressBook).asJava) + val NUM_ADDRESS_BOOK_EMAIL_EWMA = + new Continuous("realgraph.num_address_book_email.ewma", Set(AddressBook).asJava) + val NUM_ADDRESS_BOOK_EMAIL_VARIANCE = + new 
Continuous("realgraph.num_address_book_email.variance", Set(AddressBook).asJava) + val NUM_ADDRESS_BOOK_EMAIL_NON_ZERO_DAYS = new Continuous( + "realgraph.num_address_book_email.non_zero_days", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_EMAIL_ELAPSED_DAYS = new Continuous( + "realgraph.num_address_book_email.elapsed_days", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_EMAIL_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_address_book_email.days_since_last", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_EMAIL_IS_MISSING = + new Binary("realgraph.num_address_book_email.is_missing", Set(AddressBook).asJava) + + val NUM_ADDRESS_BOOK_IN_BOTH_MEAN = + new Continuous("realgraph.num_address_book_in_both.mean", Set(AddressBook).asJava) + val NUM_ADDRESS_BOOK_IN_BOTH_EWMA = + new Continuous("realgraph.num_address_book_in_both.ewma", Set(AddressBook).asJava) + val NUM_ADDRESS_BOOK_IN_BOTH_VARIANCE = new Continuous( + "realgraph.num_address_book_in_both.variance", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_IN_BOTH_NON_ZERO_DAYS = new Continuous( + "realgraph.num_address_book_in_both.non_zero_days", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_IN_BOTH_ELAPSED_DAYS = new Continuous( + "realgraph.num_address_book_in_both.elapsed_days", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_IN_BOTH_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_address_book_in_both.days_since_last", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_IN_BOTH_IS_MISSING = new Binary( + "realgraph.num_address_book_in_both.is_missing", + Set(AddressBook).asJava + ) + + val NUM_ADDRESS_BOOK_PHONE_MEAN = + new Continuous("realgraph.num_address_book_phone.mean", Set(AddressBook).asJava) + val NUM_ADDRESS_BOOK_PHONE_EWMA = + new Continuous("realgraph.num_address_book_phone.ewma", Set(AddressBook).asJava) + val NUM_ADDRESS_BOOK_PHONE_VARIANCE = + new Continuous("realgraph.num_address_book_phone.variance", Set(AddressBook).asJava) + val 
NUM_ADDRESS_BOOK_PHONE_NON_ZERO_DAYS = new Continuous( + "realgraph.num_address_book_phone.non_zero_days", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_PHONE_ELAPSED_DAYS = new Continuous( + "realgraph.num_address_book_phone.elapsed_days", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_PHONE_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_address_book_phone.days_since_last", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_PHONE_IS_MISSING = + new Binary("realgraph.num_address_book_phone.is_missing", Set(AddressBook).asJava) + + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_MEAN = + new Continuous("realgraph.num_address_book_mutual_edge_email.mean", Set(AddressBook).asJava) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_EWMA = + new Continuous("realgraph.num_address_book_mutual_edge_email.ewma", Set(AddressBook).asJava) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_VARIANCE = + new Continuous("realgraph.num_address_book_mutual_edge_email.variance", Set(AddressBook).asJava) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_NON_ZERO_DAYS = new Continuous( + "realgraph.num_address_book_mutual_edge_email.non_zero_days", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_ELAPSED_DAYS = new Continuous( + "realgraph.num_address_book_mutual_edge_email.elapsed_days", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_address_book_mutual_edge_email.days_since_last", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_IS_MISSING = + new Binary("realgraph.num_address_book_mutual_edge_email.is_missing", Set(AddressBook).asJava) + + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_MEAN = + new Continuous("realgraph.num_address_book_mutual_edge_in_both.mean", Set(AddressBook).asJava) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_EWMA = + new Continuous("realgraph.num_address_book_mutual_edge_in_both.ewma", Set(AddressBook).asJava) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_VARIANCE = new 
Continuous( + "realgraph.num_address_book_mutual_edge_in_both.variance", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_NON_ZERO_DAYS = new Continuous( + "realgraph.num_address_book_mutual_edge_in_both.non_zero_days", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_ELAPSED_DAYS = new Continuous( + "realgraph.num_address_book_mutual_edge_in_both.elapsed_days", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_address_book_mutual_edge_in_both.days_since_last", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_IS_MISSING = new Binary( + "realgraph.num_address_book_mutual_edge_in_both.is_missing", + Set(AddressBook).asJava + ) + + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_MEAN = + new Continuous("realgraph.num_address_book_mutual_edge_phone.mean", Set(AddressBook).asJava) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_EWMA = + new Continuous("realgraph.num_address_book_mutual_edge_phone.ewma", Set(AddressBook).asJava) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_VARIANCE = + new Continuous("realgraph.num_address_book_mutual_edge_phone.variance", Set(AddressBook).asJava) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_NON_ZERO_DAYS = new Continuous( + "realgraph.num_address_book_mutual_edge_phone.non_zero_days", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_ELAPSED_DAYS = new Continuous( + "realgraph.num_address_book_mutual_edge_phone.elapsed_days", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_DAYS_SINCE_LAST = new Continuous( + "realgraph.num_address_book_mutual_edge_phone.days_since_last", + Set(AddressBook).asJava + ) + val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_IS_MISSING = + new Binary("realgraph.num_address_book_mutual_edge_phone.is_missing", Set(AddressBook).asJava) +} + +/** Groups an optional RealGraphEdgeFeature with six Continuous features (mean, ewma, variance, non-zero days, elapsed days, days since last) and one Binary is-missing feature. NOTE(review): presumably pairs one edge feature with the data-record features its statistics are written to -- confirm against callers. */ case class RealGraphEdgeDataRecordFeatures( + edgeFeatureOpt: Option[RealGraphEdgeFeature], + meanFeature: Continuous, +
ewmaFeature: Continuous, + varianceFeature: Continuous, + nonZeroDaysFeature: Continuous, + elapsedDaysFeature: Continuous, + daysSinceLastFeature: Continuous, + isMissingFeature: Binary) diff --git a/src/scala/com/twitter/timelines/prediction/features/recap/BUILD b/src/scala/com/twitter/timelines/prediction/features/recap/BUILD new file mode 100644 index 0000000000..6fc497bf3d --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/recap/BUILD @@ -0,0 +1,9 @@ +scala_library( + sources = ["*.scala"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "src/java/com/twitter/ml/api:api-base", + "src/thrift/com/twitter/dal/personal_data:personal_data-java", + ], +) diff --git a/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeatures.scala new file mode 100644 index 0000000000..c8ee6da7dd --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeatures.scala @@ -0,0 +1,967 @@ +package com.twitter.timelines.prediction.features.recap + +import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ +import com.twitter.ml.api.Feature.Binary +import com.twitter.ml.api.Feature.Continuous +import com.twitter.ml.api.Feature.Discrete +import com.twitter.ml.api.Feature.SparseBinary +import com.twitter.ml.api.Feature.Text +import scala.collection.JavaConverters._ + +/** RecapFeatures with an empty prefix: name(...) returns feature names unchanged. */ object RecapFeatures extends RecapFeatures("") +/** RecapFeatures with prefix "in_reply_to_tweet": name(...) returns "in_reply_to_tweet.<featureName>". */ object InReplyToRecapFeatures extends RecapFeatures("in_reply_to_tweet") + +/** Declares the recap feature constants; prefix is applied to feature names by the private name(...) helper. */ class RecapFeatures(prefix: String) { + /** Returns "prefix.featureName" when prefix is non-empty, otherwise featureName unchanged. NOTE(review): only names routed through this helper receive the prefix; some vals in this class pass a literal name directly (e.g. "recap.engagement.is_profile_dwelled", "recap.engagement.is_followed"), so those names are the same for every prefix -- confirm that is intended. */ private def name(featureName: String): String = { + if (prefix.nonEmpty) { + s"$prefix.$featureName" + } else { + featureName + } + } + + val IS_IPAD_CLIENT = new Binary(name("recap.client.is_ipad"), Set(ClientType).asJava) + val IS_WEB_CLIENT = new Binary(name("recap.client.is_web"), Set(ClientType).asJava) + val IS_IPHONE_CLIENT = new Binary(name("recap.client.is_phone"), 
Set(ClientType).asJava) + val IS_ANDROID_CLIENT = new Binary(name("recap.client.is_android"), Set(ClientType).asJava) + val IS_ANDROID_TABLET_CLIENT = + new Binary(name("recap.client.is_android_tablet"), Set(ClientType).asJava) + + // features from userAgent + val CLIENT_NAME = new Text(name("recap.user_agent.client_name"), Set(ClientType).asJava) + val CLIENT_SOURCE = new Discrete(name("recap.user_agent.client_source"), Set(ClientType).asJava) + val CLIENT_VERSION = new Text(name("recap.user_agent.client_version"), Set(ClientVersion).asJava) + val CLIENT_VERSION_CODE = + new Text(name("recap.user_agent.client_version_code"), Set(ClientVersion).asJava) + val DEVICE = new Text(name("recap.user_agent.device"), Set(DeviceType).asJava) + val FROM_DOG_FOOD = new Binary(name("recap.meta.from_dog_food"), Set(UserAgent).asJava) + val FROM_TWITTER_CLIENT = + new Binary(name("recap.user_agent.from_twitter_client"), Set(UserAgent).asJava) + val MANUFACTURER = new Text(name("recap.user_agent.manufacturer"), Set(UserAgent).asJava) + val MODEL = new Text(name("recap.user_agent.model"), Set(UserAgent).asJava) + val NETWORK_CONNECTION = + new Discrete(name("recap.user_agent.network_connection"), Set(UserAgent).asJava) + val SDK_VERSION = new Text(name("recap.user_agent.sdk_version"), Set(AppId, UserAgent).asJava) + + // engagement + val IS_RETWEETED = new Binary( + name("recap.engagement.is_retweeted"), + Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) + val IS_FAVORITED = new Binary( + name("recap.engagement.is_favorited"), + Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava) + val IS_REPLIED = new Binary( + name("recap.engagement.is_replied"), + Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava) + // v1: post click engagements: fav, reply + val IS_GOOD_CLICKED_CONVO_DESC_V1 = new Binary( + name("recap.engagement.is_good_clicked_convo_desc_favorited_or_replied"), + Set( + PublicLikes, + 
PrivateLikes, + PublicReplies, + PrivateReplies, + EngagementsPrivate, + EngagementsPublic).asJava) + // v2: post click engagements: click + val IS_GOOD_CLICKED_CONVO_DESC_V2 = new Binary( + name("recap.engagement.is_good_clicked_convo_desc_v2"), + Set(TweetsClicked, EngagementsPrivate).asJava) + + val IS_GOOD_CLICKED_CONVO_DESC_FAVORITED = new Binary( + name("recap.engagement.is_good_clicked_convo_desc_favorited"), + Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_REPLIED = new Binary( + name("recap.engagement.is_good_clicked_convo_desc_replied"), + Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_RETWEETED = new Binary( + name("recap.engagement.is_good_clicked_convo_desc_retweeted"), + Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_CLICKED = new Binary( + name("recap.engagement.is_good_clicked_convo_desc_clicked"), + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_FOLLOWED = new Binary( + name("recap.engagement.is_good_clicked_convo_desc_followed"), + Set(EngagementsPrivate).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_SHARE_DM_CLICKED = new Binary( + name("recap.engagement.is_good_clicked_convo_desc_share_dm_clicked"), + Set(EngagementsPrivate).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_PROFILE_CLICKED = new Binary( + name("recap.engagement.is_good_clicked_convo_desc_profile_clicked"), + Set(EngagementsPrivate).asJava) + + val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_0 = new Binary( + name("recap.engagement.is_good_clicked_convo_desc_uam_gt_0"), + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_1 = new Binary( + name("recap.engagement.is_good_clicked_convo_desc_uam_gt_1"), + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_2 = new Binary( + 
name("recap.engagement.is_good_clicked_convo_desc_uam_gt_2"), + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_3 = new Binary( + name("recap.engagement.is_good_clicked_convo_desc_uam_gt_3"), + Set(EngagementsPrivate, EngagementsPublic).asJava) + + val IS_TWEET_DETAIL_DWELLED = new Binary( + name("recap.engagement.is_tweet_detail_dwelled"), + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_TWEET_DETAIL_DWELLED_8_SEC = new Binary( + name("recap.engagement.is_tweet_detail_dwelled_8_sec"), + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_TWEET_DETAIL_DWELLED_15_SEC = new Binary( + name("recap.engagement.is_tweet_detail_dwelled_15_sec"), + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_TWEET_DETAIL_DWELLED_25_SEC = new Binary( + name("recap.engagement.is_tweet_detail_dwelled_25_sec"), + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_TWEET_DETAIL_DWELLED_30_SEC = new Binary( + name("recap.engagement.is_tweet_detail_dwelled_30_sec"), + Set(TweetsClicked, EngagementsPrivate).asJava) + + val IS_PROFILE_DWELLED = new Binary( + "recap.engagement.is_profile_dwelled", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_DWELLED_10_SEC = new Binary( + "recap.engagement.is_profile_dwelled_10_sec", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_DWELLED_20_SEC = new Binary( + "recap.engagement.is_profile_dwelled_20_sec", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_DWELLED_30_SEC = new Binary( + "recap.engagement.is_profile_dwelled_30_sec", + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + + val IS_FULLSCREEN_VIDEO_DWELLED = new Binary( + "recap.engagement.is_fullscreen_video_dwelled", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_FULLSCREEN_VIDEO_DWELLED_5_SEC = new Binary( + "recap.engagement.is_fullscreen_video_dwelled_5_sec", + 
Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_FULLSCREEN_VIDEO_DWELLED_10_SEC = new Binary( + "recap.engagement.is_fullscreen_video_dwelled_10_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_FULLSCREEN_VIDEO_DWELLED_20_SEC = new Binary( + "recap.engagement.is_fullscreen_video_dwelled_20_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_FULLSCREEN_VIDEO_DWELLED_30_SEC = new Binary( + "recap.engagement.is_fullscreen_video_dwelled_30_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_LINK_DWELLED_15_SEC = new Binary( + "recap.engagement.is_link_dwelled_15_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_LINK_DWELLED_30_SEC = new Binary( + "recap.engagement.is_link_dwelled_30_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_LINK_DWELLED_60_SEC = new Binary( + "recap.engagement.is_link_dwelled_60_sec", + Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) + + val IS_QUOTED = new Binary( + name("recap.engagement.is_quoted"), + Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) + val IS_RETWEETED_WITHOUT_QUOTE = new Binary( + name("recap.engagement.is_retweeted_without_quote"), + Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) + val IS_CLICKED = + new Binary(name("recap.engagement.is_clicked"), Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_DWELLED = new Binary(name("recap.engagement.is_dwelled"), Set(EngagementsPrivate).asJava) + val IS_DWELLED_IN_BOUNDS_V1 = + new Binary(name("recap.engagement.is_dwelled_in_bounds_v1"), Set(EngagementsPrivate).asJava) + val DWELL_NORMALIZED_OVERALL = new Continuous( + name("recap.engagement.dwell_normalized_overall"), + 
Set(EngagementsPrivate).asJava) + val DWELL_CDF_OVERALL = + new Continuous(name("recap.engagement.dwell_cdf_overall"), Set(EngagementsPrivate).asJava) + val DWELL_CDF = new Continuous(name("recap.engagement.dwell_cdf"), Set(EngagementsPrivate).asJava) + + val IS_DWELLED_1S = + new Binary(name("recap.engagement.is_dwelled_1s"), Set(EngagementsPrivate).asJava) + val IS_DWELLED_2S = + new Binary(name("recap.engagement.is_dwelled_2s"), Set(EngagementsPrivate).asJava) + val IS_DWELLED_3S = + new Binary(name("recap.engagement.is_dwelled_3s"), Set(EngagementsPrivate).asJava) + val IS_DWELLED_4S = + new Binary(name("recap.engagement.is_dwelled_4s"), Set(EngagementsPrivate).asJava) + val IS_DWELLED_5S = + new Binary(name("recap.engagement.is_dwelled_5s"), Set(EngagementsPrivate).asJava) + val IS_DWELLED_6S = + new Binary(name("recap.engagement.is_dwelled_6s"), Set(EngagementsPrivate).asJava) + val IS_DWELLED_7S = + new Binary(name("recap.engagement.is_dwelled_7s"), Set(EngagementsPrivate).asJava) + val IS_DWELLED_8S = + new Binary(name("recap.engagement.is_dwelled_8s"), Set(EngagementsPrivate).asJava) + val IS_DWELLED_9S = + new Binary(name("recap.engagement.is_dwelled_9s"), Set(EngagementsPrivate).asJava) + val IS_DWELLED_10S = + new Binary(name("recap.engagement.is_dwelled_10s"), Set(EngagementsPrivate).asJava) + + val IS_SKIPPED_1S = + new Binary(name("recap.engagement.is_skipped_1s"), Set(EngagementsPrivate).asJava) + val IS_SKIPPED_2S = + new Binary(name("recap.engagement.is_skipped_2s"), Set(EngagementsPrivate).asJava) + val IS_SKIPPED_3S = + new Binary(name("recap.engagement.is_skipped_3s"), Set(EngagementsPrivate).asJava) + val IS_SKIPPED_4S = + new Binary(name("recap.engagement.is_skipped_4s"), Set(EngagementsPrivate).asJava) + val IS_SKIPPED_5S = + new Binary(name("recap.engagement.is_skipped_5s"), Set(EngagementsPrivate).asJava) + val IS_SKIPPED_6S = + new Binary(name("recap.engagement.is_skipped_6s"), Set(EngagementsPrivate).asJava) + val IS_SKIPPED_7S = + new 
Binary(name("recap.engagement.is_skipped_7s"), Set(EngagementsPrivate).asJava) + val IS_SKIPPED_8S = + new Binary(name("recap.engagement.is_skipped_8s"), Set(EngagementsPrivate).asJava) + val IS_SKIPPED_9S = + new Binary(name("recap.engagement.is_skipped_9s"), Set(EngagementsPrivate).asJava) + val IS_SKIPPED_10S = + new Binary(name("recap.engagement.is_skipped_10s"), Set(EngagementsPrivate).asJava) + + val IS_IMPRESSED = + new Binary(name("recap.engagement.is_impressed"), Set(EngagementsPrivate).asJava) + val IS_FOLLOWED = + new Binary("recap.engagement.is_followed", Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_PROFILE_CLICKED = new Binary( + name("recap.engagement.is_profile_clicked"), + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_OPEN_LINKED = new Binary( + name("recap.engagement.is_open_linked"), + Set(EngagementsPrivate, LinksClickedOn).asJava) + val IS_PHOTO_EXPANDED = + new Binary(name("recap.engagement.is_photo_expanded"), Set(EngagementsPrivate).asJava) + val IS_VIDEO_VIEWED = + new Binary(name("recap.engagement.is_video_viewed"), Set(EngagementsPrivate).asJava) + val IS_VIDEO_PLAYBACK_START = + new Binary(name("recap.engagement.is_video_playback_start"), Set(EngagementsPrivate).asJava) + val IS_VIDEO_PLAYBACK_25 = + new Binary(name("recap.engagement.is_video_playback_25"), Set(EngagementsPrivate).asJava) + val IS_VIDEO_PLAYBACK_50 = + new Binary(name("recap.engagement.is_video_playback_50"), Set(EngagementsPrivate).asJava) + val IS_VIDEO_PLAYBACK_75 = + new Binary(name("recap.engagement.is_video_playback_75"), Set(EngagementsPrivate).asJava) + val IS_VIDEO_PLAYBACK_95 = + new Binary(name("recap.engagement.is_video_playback_95"), Set(EngagementsPrivate).asJava) + val IS_VIDEO_PLAYBACK_COMPLETE = + new Binary(name("recap.engagement.is_video_playback_complete"), Set(EngagementsPrivate).asJava) + val IS_VIDEO_VIEWED_AND_PLAYBACK_50 = new Binary( + name("recap.engagement.is_video_viewed_and_playback_50"), + 
Set(EngagementsPrivate).asJava) + val IS_VIDEO_QUALITY_VIEWED = new Binary( + name("recap.engagement.is_video_quality_viewed"), + Set(EngagementsPrivate).asJava + ) + val IS_TWEET_SHARE_DM_CLICKED = + new Binary(name("recap.engagement.is_tweet_share_dm_clicked"), Set(EngagementsPrivate).asJava) + val IS_TWEET_SHARE_DM_SENT = + new Binary(name("recap.engagement.is_tweet_share_dm_sent"), Set(EngagementsPrivate).asJava) + val IS_BOOKMARKED = + new Binary(name("recap.engagement.is_bookmarked"), Set(EngagementsPrivate).asJava) + val IS_SHARED = + new Binary(name("recap.engagement.is_shared"), Set(EngagementsPrivate).asJava) + val IS_SHARE_MENU_CLICKED = + new Binary(name("recap.engagement.is_share_menu_clicked"), Set(EngagementsPrivate).asJava) + + // Negative engagements + val IS_DONT_LIKE = + new Binary(name("recap.engagement.is_dont_like"), Set(EngagementsPrivate).asJava) + val IS_BLOCK_CLICKED = new Binary( + name("recap.engagement.is_block_clicked"), + Set(TweetsClicked, EngagementsPrivate, EngagementsPublic).asJava) + val IS_BLOCK_DIALOG_BLOCKED = new Binary( + name("recap.engagement.is_block_dialog_blocked"), + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_MUTE_CLICKED = new Binary( + name("recap.engagement.is_mute_clicked"), + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_MUTE_DIALOG_MUTED = + new Binary(name("recap.engagement.is_mute_dialog_muted"), Set(EngagementsPrivate).asJava) + val IS_REPORT_TWEET_CLICKED = new Binary( + name("recap.engagement.is_report_tweet_clicked"), + Set(TweetsClicked, EngagementsPrivate).asJava) + val IS_NEGATIVE_FEEDBACK = + new Binary("recap.engagement.is_negative_feedback", Set(EngagementsPrivate).asJava) + val IS_NOT_ABOUT_TOPIC = + new Binary(name("recap.engagement.is_not_about_topic"), Set(EngagementsPrivate).asJava) + val IS_NOT_RECENT = + new Binary(name("recap.engagement.is_not_recent"), Set(EngagementsPrivate).asJava) + val IS_NOT_RELEVANT = + new Binary(name("recap.engagement.is_not_relevant"), 
Set(EngagementsPrivate).asJava) + val IS_SEE_FEWER = + new Binary(name("recap.engagement.is_see_fewer"), Set(EngagementsPrivate).asJava) + val IS_TOPIC_SPEC_NEG_ENGAGEMENT = + new Binary("recap.engagement.is_topic_spec_neg_engagement", Set(EngagementsPrivate).asJava) + val IS_UNFOLLOW_TOPIC = + new Binary("recap.engagement.is_unfollow_topic", Set(EngagementsPrivate).asJava) + val IS_UNFOLLOW_TOPIC_EXPLICIT_POSITIVE_LABEL = + new Binary( + "recap.engagement.is_unfollow_topic_explicit_positive_label", + Set(EngagementsPrivate).asJava) + val IS_UNFOLLOW_TOPIC_IMPLICIT_POSITIVE_LABEL = + new Binary( + "recap.engagement.is_unfollow_topic_implicit_positive_label", + Set(EngagementsPrivate).asJava) + val IS_UNFOLLOW_TOPIC_STRONG_EXPLICIT_NEGATIVE_LABEL = + new Binary( + "recap.engagement.is_unfollow_topic_strong_explicit_negative_label", + Set(EngagementsPrivate).asJava) + val IS_UNFOLLOW_TOPIC_EXPLICIT_NEGATIVE_LABEL = + new Binary( + "recap.engagement.is_unfollow_topic_explicit_negative_label", + Set(EngagementsPrivate).asJava) + val IS_NOT_INTERESTED_IN = + new Binary("recap.engagement.is_not_interested_in", Set(EngagementsPrivate).asJava) + val IS_NOT_INTERESTED_IN_EXPLICIT_POSITIVE_LABEL = + new Binary( + "recap.engagement.is_not_interested_in_explicit_positive_label", + Set(EngagementsPrivate).asJava) + val IS_NOT_INTERESTED_IN_EXPLICIT_NEGATIVE_LABEL = + new Binary( + "recap.engagement.is_not_interested_in_explicit_negative_label", + Set(EngagementsPrivate).asJava) + val IS_CARET_CLICKED = + new Binary(name("recap.engagement.is_caret_clicked"), Set(EngagementsPrivate).asJava) + val IS_FOLLOW_TOPIC = + new Binary("recap.engagement.is_follow_topic", Set(EngagementsPrivate).asJava) + val IS_NOT_INTERESTED_IN_TOPIC = + new Binary("recap.engagement.is_not_interested_in_topic", Set(EngagementsPrivate).asJava) + val IS_HOME_LATEST_VISITED = + new Binary(name("recap.engagement.is_home_latest_visited"), Set(EngagementsPrivate).asJava) + + // Relevance prompt tweet 
engagements + val IS_RELEVANCE_PROMPT_YES_CLICKED = new Binary( + name("recap.engagement.is_relevance_prompt_yes_clicked"), + Set(EngagementsPrivate).asJava) + val IS_RELEVANCE_PROMPT_NO_CLICKED = new Binary( + name("recap.engagement.is_relevance_prompt_no_clicked"), + Set(EngagementsPrivate).asJava) + val IS_RELEVANCE_PROMPT_IMPRESSED = new Binary( + name("recap.engagement.is_relevance_prompt_impressed"), + Set(EngagementsPrivate).asJava) + + // Reciprocal engagements for reply forward engagement + val IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR = new Binary( + name("recap.engagement.is_replied_reply_impressed_by_author"), + Set(EngagementsPrivate).asJava) + val IS_REPLIED_REPLY_FAVORITED_BY_AUTHOR = new Binary( + name("recap.engagement.is_replied_reply_favorited_by_author"), + Set(EngagementsPrivate, EngagementsPublic, PrivateLikes, PublicLikes).asJava) + val IS_REPLIED_REPLY_QUOTED_BY_AUTHOR = new Binary( + name("recap.engagement.is_replied_reply_quoted_by_author"), + Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava) + val IS_REPLIED_REPLY_REPLIED_BY_AUTHOR = new Binary( + name("recap.engagement.is_replied_reply_replied_by_author"), + Set(EngagementsPrivate, EngagementsPublic, PrivateReplies, PublicReplies).asJava) + val IS_REPLIED_REPLY_RETWEETED_BY_AUTHOR = new Binary( + name("recap.engagement.is_replied_reply_retweeted_by_author"), + Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava) + val IS_REPLIED_REPLY_BLOCKED_BY_AUTHOR = new Binary( + name("recap.engagement.is_replied_reply_blocked_by_author"), + Set(EngagementsPrivate, EngagementsPublic).asJava) + val IS_REPLIED_REPLY_FOLLOWED_BY_AUTHOR = new Binary( + name("recap.engagement.is_replied_reply_followed_by_author"), + Set(EngagementsPrivate, EngagementsPublic, Follow).asJava) + val IS_REPLIED_REPLY_UNFOLLOWED_BY_AUTHOR = new Binary( + name("recap.engagement.is_replied_reply_unfollowed_by_author"), + Set(EngagementsPrivate, 
EngagementsPublic).asJava) + val IS_REPLIED_REPLY_MUTED_BY_AUTHOR = new Binary( + name("recap.engagement.is_replied_reply_muted_by_author"), + Set(EngagementsPrivate).asJava) + val IS_REPLIED_REPLY_REPORTED_BY_AUTHOR = new Binary( + name("recap.engagement.is_replied_reply_reported_by_author"), + Set(EngagementsPrivate).asJava) + + // This derived label is the logical OR of REPLY_REPLIED, REPLY_FAVORITED, REPLY_RETWEETED + val IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR = new Binary( + name("recap.engagement.is_replied_reply_engaged_by_author"), + Set(EngagementsPrivate, EngagementsPublic).asJava) + + // Reciprocal engagements for fav forward engagement + val IS_FAVORITED_FAV_FAVORITED_BY_AUTHOR = new Binary( + name("recap.engagement.is_favorited_fav_favorited_by_author"), + Set(EngagementsPrivate, EngagementsPublic, PrivateLikes, PublicLikes).asJava + ) + val IS_FAVORITED_FAV_REPLIED_BY_AUTHOR = new Binary( + name("recap.engagement.is_favorited_fav_replied_by_author"), + Set(EngagementsPrivate, EngagementsPublic, PrivateReplies, PublicReplies).asJava + ) + val IS_FAVORITED_FAV_RETWEETED_BY_AUTHOR = new Binary( + name("recap.engagement.is_favorited_fav_retweeted_by_author"), + Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava + ) + val IS_FAVORITED_FAV_FOLLOWED_BY_AUTHOR = new Binary( + name("recap.engagement.is_favorited_fav_followed_by_author"), + Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava + ) + // This derived label is the logical OR of FAV_REPLIED, FAV_FAVORITED, FAV_RETWEETED, FAV_FOLLOWED + val IS_FAVORITED_FAV_ENGAGED_BY_AUTHOR = new Binary( + name("recap.engagement.is_favorited_fav_engaged_by_author"), + Set(EngagementsPrivate, EngagementsPublic).asJava) + + // define good profile click by considering following engagements (follow, fav, reply, retweet, etc.) 
at profile page + val IS_PROFILE_CLICKED_AND_PROFILE_FOLLOW = new Binary( + name("recap.engagement.is_profile_clicked_and_profile_follow"), + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, Follow).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_FAV = new Binary( + name("recap.engagement.is_profile_clicked_and_profile_fav"), + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateLikes, PublicLikes).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_REPLY = new Binary( + name("recap.engagement.is_profile_clicked_and_profile_reply"), + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateReplies, PublicReplies).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_RETWEET = new Binary( + name("recap.engagement.is_profile_clicked_and_profile_retweet"), + Set( + ProfilesViewed, + ProfilesClicked, + EngagementsPrivate, + PrivateRetweets, + PublicRetweets).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_CLICK = new Binary( + name("recap.engagement.is_profile_clicked_and_profile_tweet_click"), + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, TweetsClicked).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_SHARE_DM_CLICK = new Binary( + name("recap.engagement.is_profile_clicked_and_profile_share_dm_click"), + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + // This derived label is the union of all binary features above + val IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED = new Binary( + name("recap.engagement.is_profile_clicked_and_profile_engaged"), + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, EngagementsPublic).asJava) + + // define bad profile click by considering following engagements (user report, tweet report, mute, block, etc) at profile page + val IS_PROFILE_CLICKED_AND_PROFILE_USER_REPORT_CLICK = new Binary( + name("recap.engagement.is_profile_clicked_and_profile_user_report_click"), + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_REPORT_CLICK = new Binary( 
+ name("recap.engagement.is_profile_clicked_and_profile_tweet_report_click"), + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_MUTE = new Binary( + name("recap.engagement.is_profile_clicked_and_profile_mute"), + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_PROFILE_CLICKED_AND_PROFILE_BLOCK = new Binary( + name("recap.engagement.is_profile_clicked_and_profile_block"), + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + // This derived label is the union of bad profile click engagements and existing negative feedback + val IS_NEGATIVE_FEEDBACK_V2 = new Binary( + name("recap.engagement.is_negative_feedback_v2"), + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_STRONG_NEGATIVE_FEEDBACK = new Binary( + name("recap.engagement.is_strong_negative_feedback"), + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + val IS_WEAK_NEGATIVE_FEEDBACK = new Binary( + name("recap.engagement.is_weak_negative_feedback"), + Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) + // engagement for following user from any surface area + val IS_FOLLOWED_FROM_ANY_SURFACE_AREA = new Binary( + "recap.engagement.is_followed_from_any_surface_area", + Set(EngagementsPublic, EngagementsPrivate).asJava) + + // Reply downvote engagements + val IS_REPLY_DOWNVOTED = + new Binary(name("recap.engagement.is_reply_downvoted"), Set(EngagementsPrivate).asJava) + val IS_REPLY_DOWNVOTE_REMOVED = + new Binary(name("recap.engagement.is_reply_downvote_removed"), Set(EngagementsPrivate).asJava) + + // Other engagements + val IS_GOOD_OPEN_LINK = new Binary( + name("recap.engagement.is_good_open_link"), + Set(EngagementsPrivate, LinksClickedOn).asJava) + val IS_ENGAGED = new Binary( + name("recap.engagement.any"), + Set(EngagementsPrivate, EngagementsPublic).asJava + ) // Deprecated - to be removed shortly + val IS_EARLYBIRD_UNIFIED_ENGAGEMENT = new Binary( + 
name("recap.engagement.is_unified_engagement"), + Set(EngagementsPrivate, EngagementsPublic).asJava + ) // A subset of IS_ENGAGED specifically intended for use in earlybird models + + // features from ThriftTweetFeatures + val PREV_USER_TWEET_ENGAGEMENT = new Continuous( + name("recap.tweetfeature.prev_user_tweet_enagagement"), + Set(EngagementScore, EngagementsPrivate, EngagementsPublic).asJava) + val IS_SENSITIVE = new Binary(name("recap.tweetfeature.is_sensitive")) + val HAS_MULTIPLE_MEDIA = new Binary( + name("recap.tweetfeature.has_multiple_media"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val IS_AUTHOR_PROFILE_EGG = new Binary(name("recap.tweetfeature.is_author_profile_egg")) + val IS_AUTHOR_NEW = + new Binary(name("recap.tweetfeature.is_author_new"), Set(UserState, UserType).asJava) + val NUM_MENTIONS = new Continuous( + name("recap.tweetfeature.num_mentions"), + Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava) + val HAS_MENTION = new Binary(name("recap.tweetfeature.has_mention"), Set(UserVisibleFlag).asJava) + val NUM_HASHTAGS = new Continuous( + name("recap.tweetfeature.num_hashtags"), + Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava) + val HAS_HASHTAG = new Binary( + name("recap.tweetfeature.has_hashtag"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val LINK_LANGUAGE = new Continuous( + name("recap.tweetfeature.link_language"), + Set(ProvidedLanguage, InferredLanguage).asJava) + val IS_AUTHOR_NSFW = + new Binary(name("recap.tweetfeature.is_author_nsfw"), Set(UserSafetyLabels, UserType).asJava) + val IS_AUTHOR_SPAM = + new Binary(name("recap.tweetfeature.is_author_spam"), Set(UserSafetyLabels, UserType).asJava) + val IS_AUTHOR_BOT = + new Binary(name("recap.tweetfeature.is_author_bot"), Set(UserSafetyLabels, UserType).asJava) + val SIGNATURE = + new Discrete(name("recap.tweetfeature.signature"), 
Set(DigitalSignatureNonrepudiation).asJava) + val LANGUAGE = new Discrete( + name("recap.tweetfeature.language"), + Set(ProvidedLanguage, InferredLanguage).asJava) + val FROM_INACTIVE_USER = + new Binary(name("recap.tweetfeature.from_inactive_user"), Set(UserActiveFlag).asJava) + val PROBABLY_FROM_FOLLOWED_AUTHOR = new Binary(name("recap.v3.tweetfeature.probably_from_follow")) + val FROM_MUTUAL_FOLLOW = new Binary(name("recap.tweetfeature.from_mutual_follow")) + val USER_REP = new Continuous(name("recap.tweetfeature.user_rep")) + val FROM_VERIFIED_ACCOUNT = + new Binary(name("recap.tweetfeature.from_verified_account"), Set(UserVerifiedFlag).asJava) + val IS_BUSINESS_SCORE = new Continuous(name("recap.tweetfeature.is_business_score")) + val HAS_CONSUMER_VIDEO = new Binary( + name("recap.tweetfeature.has_consumer_video"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_PRO_VIDEO = new Binary( + name("recap.tweetfeature.has_pro_video"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_VINE = new Binary( + name("recap.tweetfeature.has_vine"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_PERISCOPE = new Binary( + name("recap.tweetfeature.has_periscope"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_NATIVE_VIDEO = new Binary( + name("recap.tweetfeature.has_native_video"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_NATIVE_IMAGE = new Binary( + name("recap.tweetfeature.has_native_image"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_CARD = new Binary( + name("recap.tweetfeature.has_card"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_IMAGE = new Binary( + name("recap.tweetfeature.has_image"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val 
HAS_NEWS = new Binary( + name("recap.tweetfeature.has_news"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_VIDEO = new Binary( + name("recap.tweetfeature.has_video"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_VISIBLE_LINK = new Binary( + name("recap.tweetfeature.has_visible_link"), + Set(UrlFoundFlag, PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val LINK_COUNT = new Continuous( + name("recap.tweetfeature.link_count"), + Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava) + val HAS_LINK = new Binary( + name("recap.tweetfeature.has_link"), + Set(UrlFoundFlag, PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val IS_OFFENSIVE = new Binary(name("recap.tweetfeature.is_offensive")) + val HAS_TREND = new Binary( + name("recap.tweetfeature.has_trend"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val HAS_MULTIPLE_HASHTAGS_OR_TRENDS = new Binary( + name("recap.tweetfeature.has_multiple_hashtag_or_trend"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val URL_DOMAINS = new SparseBinary( + name("recap.tweetfeature.url_domains"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val CONTAINS_MEDIA = new Binary( + name("recap.tweetfeature.contains_media"), + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val RETWEET_SEARCHER = new Binary(name("recap.tweetfeature.retweet_searcher")) + val REPLY_SEARCHER = new Binary(name("recap.tweetfeature.reply_searcher")) + val MENTION_SEARCHER = + new Binary(name("recap.tweetfeature.mention_searcher"), Set(UserVisibleFlag).asJava) + val REPLY_OTHER = + new Binary(name("recap.tweetfeature.reply_other"), Set(PublicReplies, PrivateReplies).asJava) + val RETWEET_OTHER = new Binary( + name("recap.tweetfeature.retweet_other"), + 
Set(PublicRetweets, PrivateRetweets).asJava) + val IS_REPLY = + new Binary(name("recap.tweetfeature.is_reply"), Set(PublicReplies, PrivateReplies).asJava) + val IS_RETWEET = + new Binary(name("recap.tweetfeature.is_retweet"), Set(PublicRetweets, PrivateRetweets).asJava) + val IS_EXTENDED_REPLY = new Binary( + name("recap.tweetfeature.is_extended_reply"), + Set(PublicReplies, PrivateReplies).asJava) + val MATCH_UI_LANG = new Binary( + name("recap.tweetfeature.match_ui_lang"), + Set(ProvidedLanguage, InferredLanguage).asJava) + val MATCH_SEARCHER_MAIN_LANG = new Binary( + name("recap.tweetfeature.match_searcher_main_lang"), + Set(ProvidedLanguage, InferredLanguage).asJava) + val MATCH_SEARCHER_LANGS = new Binary( + name("recap.tweetfeature.match_searcher_langs"), + Set(ProvidedLanguage, InferredLanguage).asJava) + val BIDIRECTIONAL_REPLY_COUNT = new Continuous( + name("recap.tweetfeature.bidirectional_reply_count"), + Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) + val UNIDIRECTIONAL_REPLY_COUNT = new Continuous( + name("recap.tweetfeature.unidirectional_reply_count"), + Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) + val BIDIRECTIONAL_RETWEET_COUNT = new Continuous( + name("recap.tweetfeature.bidirectional_retweet_count"), + Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) + val UNIDIRECTIONAL_RETWEET_COUNT = new Continuous( + name("recap.tweetfeature.unidirectional_retweet_count"), + Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) + val BIDIRECTIONAL_FAV_COUNT = new Continuous( + name("recap.tweetfeature.bidirectional_fav_count"), + Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) + val UNIDIRECTIONAL_FAV_COUNT = new Continuous( + name("recap.tweetfeature.unidirectiona_fav_count"), + Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) + val CONVERSATIONAL_COUNT = new Continuous( + name("recap.tweetfeature.conversational_count"), + Set(CountOfPrivateTweets, CountOfPublicTweets).asJava) + // tweet impressions on an 
embedded tweet + val EMBEDS_IMPRESSION_COUNT = new Continuous( + name("recap.tweetfeature.embeds_impression_count"), + Set(CountOfImpression).asJava) + // number of URLs that embed the tweet + val EMBEDS_URL_COUNT = new Continuous( + name("recap.tweetfeature.embeds_url_count"), + Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava) + // currently only counts views on Snappy and Amplify pro videos. Counts for other videos forthcoming + val VIDEO_VIEW_COUNT = new Continuous( + name("recap.tweetfeature.video_view_count"), + Set( + CountOfTweetEntitiesClicked, + CountOfPrivateTweetEntitiesAndMetadata, + CountOfPublicTweetEntitiesAndMetadata, + EngagementsPrivate, + EngagementsPublic).asJava + ) + val TWEET_COUNT_FROM_USER_IN_SNAPSHOT = new Continuous( + name("recap.tweetfeature.tweet_count_from_user_in_snapshot"), + Set(CountOfPrivateTweets, CountOfPublicTweets).asJava) + val NORMALIZED_PARUS_SCORE = + new Continuous("recap.tweetfeature.normalized_parus_score", Set(EngagementScore).asJava) + val PARUS_SCORE = new Continuous("recap.tweetfeature.parus_score", Set(EngagementScore).asJava) + val REAL_GRAPH_WEIGHT = + new Continuous("recap.tweetfeature.real_graph_weight", Set(UsersRealGraphScore).asJava) + val SARUS_GRAPH_WEIGHT = new Continuous("recap.tweetfeature.sarus_graph_weight") + val TOPIC_SIM_SEARCHER_INTERSTED_IN_AUTHOR_KNOWN_FOR = new Continuous( + "recap.tweetfeature.topic_sim_searcher_interested_in_author_known_for") + val TOPIC_SIM_SEARCHER_AUTHOR_BOTH_INTERESTED_IN = new Continuous( + "recap.tweetfeature.topic_sim_searcher_author_both_interested_in") + val TOPIC_SIM_SEARCHER_AUTHOR_BOTH_KNOWN_FOR = new Continuous( + "recap.tweetfeature.topic_sim_searcher_author_both_known_for") + val TOPIC_SIM_SEARCHER_INTERESTED_IN_TWEET = new Continuous( + "recap.tweetfeature.topic_sim_searcher_interested_in_tweet") + val IS_RETWEETER_PROFILE_EGG = + new Binary(name("recap.v2.tweetfeature.is_retweeter_profile_egg"), 
Set(UserType).asJava) + val IS_RETWEETER_NEW = + new Binary(name("recap.v2.tweetfeature.is_retweeter_new"), Set(UserType, UserState).asJava) + val IS_RETWEETER_BOT = + new Binary( + name("recap.v2.tweetfeature.is_retweeter_bot"), + Set(UserType, UserSafetyLabels).asJava) + val IS_RETWEETER_NSFW = + new Binary( + name("recap.v2.tweetfeature.is_retweeter_nsfw"), + Set(UserType, UserSafetyLabels).asJava) + val IS_RETWEETER_SPAM = + new Binary( + name("recap.v2.tweetfeature.is_retweeter_spam"), + Set(UserType, UserSafetyLabels).asJava) + val RETWEET_OF_MUTUAL_FOLLOW = new Binary( + name("recap.v2.tweetfeature.retweet_of_mutual_follow"), + Set(PublicRetweets, PrivateRetweets).asJava) + val SOURCE_AUTHOR_REP = new Continuous(name("recap.v2.tweetfeature.source_author_rep")) + val IS_RETWEET_OF_REPLY = new Binary( + name("recap.v2.tweetfeature.is_retweet_of_reply"), + Set(PublicRetweets, PrivateRetweets).asJava) + val RETWEET_DIRECTED_AT_USER_IN_FIRST_DEGREE = new Binary( + name("recap.v2.tweetfeature.is_retweet_directed_at_user_in_first_degree"), + Set(PublicRetweets, PrivateRetweets, Follow).asJava) + val MENTIONED_SCREEN_NAMES = new SparseBinary( + "entities.users.mentioned_screen_names", + Set(DisplayName, UserVisibleFlag).asJava) + val MENTIONED_SCREEN_NAME = new Text( + "entities.users.mentioned_screen_names.member", + Set(DisplayName, UserVisibleFlag).asJava) + val HASHTAGS = new SparseBinary( + "entities.hashtags", + Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) + val URL_SLUGS = new SparseBinary(name("recap.linkfeature.url_slugs"), Set(UrlFoundFlag).asJava) + + // features from ThriftSearchResultMetadata + val REPLY_COUNT = new Continuous( + name("recap.searchfeature.reply_count"), + Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) + val RETWEET_COUNT = new Continuous( + name("recap.searchfeature.retweet_count"), + Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) + val FAV_COUNT = new Continuous( + 
name("recap.searchfeature.fav_count"), + Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) + val BLENDER_SCORE = new Continuous(name("recap.searchfeature.blender_score")) + val TEXT_SCORE = new Continuous(name("recap.searchfeature.text_score")) + + // features related to content source + val SOURCE_TYPE = new Discrete(name("recap.source.type")) + + // features from addressbook + // the author is in the user's email addressbook + val USER_TO_AUTHOR_EMAIL_REACHABLE = + new Binary(name("recap.addressbook.user_to_author_email_reachable"), Set(AddressBook).asJava) + // the author is in the user's phone addressbook + val USER_TO_AUTHOR_PHONE_REACHABLE = + new Binary(name("recap.addressbook.user_to_author_phone_reachable"), Set(AddressBook).asJava) + // the user is in the author's email addressbook + val AUTHOR_TO_USER_EMAIL_REACHABLE = + new Binary(name("recap.addressbook.author_to_user_email_reachable"), Set(AddressBook).asJava) + // the user is in the author's phone addressbook + val AUTHOR_TO_USER_PHONE_REACHABLE = + new Binary(name("recap.addressbook.author_to_user_phone_reachable"), Set(AddressBook).asJava) + + // predicted engagement (these features are used by prediction service to return the predicted engagement probability) + // these should match the names in engagement_to_score_feature_mapping + val PREDICTED_IS_FAVORITED = + new Continuous(name("recap.engagement_predicted.is_favorited"), Set(EngagementScore).asJava) + val PREDICTED_IS_RETWEETED = + new Continuous(name("recap.engagement_predicted.is_retweeted"), Set(EngagementScore).asJava) + val PREDICTED_IS_QUOTED = + new Continuous(name("recap.engagement_predicted.is_quoted"), Set(EngagementScore).asJava) + val PREDICTED_IS_REPLIED = + new Continuous(name("recap.engagement_predicted.is_replied"), Set(EngagementScore).asJava) + val PREDICTED_IS_GOOD_OPEN_LINK = new Continuous( + name("recap.engagement_predicted.is_good_open_link"), + Set(EngagementScore).asJava) + val PREDICTED_IS_PROFILE_CLICKED = new
Continuous( + name("recap.engagement_predicted.is_profile_clicked"), + Set(EngagementScore).asJava) + val PREDICTED_IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED = new Continuous( + name("recap.engagement_predicted.is_profile_clicked_and_profile_engaged"), + Set(EngagementScore).asJava) + val PREDICTED_IS_CLICKED = + new Continuous(name("recap.engagement_predicted.is_clicked"), Set(EngagementScore).asJava) + val PREDICTED_IS_PHOTO_EXPANDED = new Continuous( + name("recap.engagement_predicted.is_photo_expanded"), + Set(EngagementScore).asJava) + val PREDICTED_IS_DONT_LIKE = + new Continuous(name("recap.engagement_predicted.is_dont_like"), Set(EngagementScore).asJava) + val PREDICTED_IS_VIDEO_PLAYBACK_50 = new Continuous( + name("recap.engagement_predicted.is_video_playback_50"), + Set(EngagementScore).asJava) + val PREDICTED_IS_VIDEO_QUALITY_VIEWED = new Continuous( + name("recap.engagement_predicted.is_video_quality_viewed"), + Set(EngagementScore).asJava) + val PREDICTED_IS_BOOKMARKED = + new Continuous(name("recap.engagement_predicted.is_bookmarked"), Set(EngagementScore).asJava) + val PREDICTED_IS_SHARED = + new Continuous(name("recap.engagement_predicted.is_shared"), Set(EngagementScore).asJava) + val PREDICTED_IS_SHARE_MENU_CLICKED = + new Continuous( + name("recap.engagement_predicted.is_share_menu_clicked"), + Set(EngagementScore).asJava) + val PREDICTED_IS_PROFILE_DWELLED_20_SEC = new Continuous( + name("recap.engagement_predicted.is_profile_dwelled_20_sec"), + Set(EngagementScore).asJava) + val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_5_SEC = new Continuous( + name("recap.engagement_predicted.is_fullscreen_video_dwelled_5_sec"), + Set(EngagementScore).asJava) + val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_10_SEC = new Continuous( + name("recap.engagement_predicted.is_fullscreen_video_dwelled_10_sec"), + Set(EngagementScore).asJava) + val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_20_SEC = new Continuous( + 
name("recap.engagement_predicted.is_fullscreen_video_dwelled_20_sec"), + Set(EngagementScore).asJava) + val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_30_SEC = new Continuous( + name("recap.engagement_predicted.is_fullscreen_video_dwelled_30_sec"), + Set(EngagementScore).asJava) + val PREDICTED_IS_UNIFIED_ENGAGEMENT = new Continuous( + name("recap.engagement_predicted.is_unified_engagement"), + Set(EngagementScore).asJava) + val PREDICTED_IS_COMPOSE_TRIGGERED = new Continuous( + name("recap.engagement_predicted.is_compose_triggered"), + Set(EngagementScore).asJava) + val PREDICTED_IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR = new Continuous( + name("recap.engagement_predicted.is_replied_reply_impressed_by_author"), + Set(EngagementScore).asJava) + val PREDICTED_IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR = new Continuous( + name("recap.engagement_predicted.is_replied_reply_engaged_by_author"), + Set(EngagementScore).asJava) + val PREDICTED_IS_GOOD_CLICKED_V1 = new Continuous( + name("recap.engagement_predicted.is_good_clicked_convo_desc_favorited_or_replied"), + Set(EngagementScore).asJava) + val PREDICTED_IS_GOOD_CLICKED_V2 = new Continuous( + name("recap.engagement_predicted.is_good_clicked_convo_desc_v2"), + Set(EngagementScore).asJava) + val PREDICTED_IS_TWEET_DETAIL_DWELLED_8_SEC = new Continuous( + name("recap.engagement_predicted.is_tweet_detail_dwelled_8_sec"), + Set(EngagementScore).asJava) + val PREDICTED_IS_TWEET_DETAIL_DWELLED_15_SEC = new Continuous( + name("recap.engagement_predicted.is_tweet_detail_dwelled_15_sec"), + Set(EngagementScore).asJava) + val PREDICTED_IS_TWEET_DETAIL_DWELLED_25_SEC = new Continuous( + name("recap.engagement_predicted.is_tweet_detail_dwelled_25_sec"), + Set(EngagementScore).asJava) + val PREDICTED_IS_TWEET_DETAIL_DWELLED_30_SEC = new Continuous( + name("recap.engagement_predicted.is_tweet_detail_dwelled_30_sec"), + Set(EngagementScore).asJava) + val PREDICTED_IS_FAVORITED_FAV_ENGAGED_BY_AUTHOR = new Continuous( + 
name("recap.engagement_predicted.is_favorited_fav_engaged_by_author"), + Set(EngagementScore).asJava) + val PREDICTED_IS_GOOD_CLICKED_WITH_DWELL_SUM_GTE_60S = new Continuous( + name( + "recap.engagement_predicted.is_good_clicked_convo_desc_favorited_or_replied_or_dwell_sum_gte_60_secs"), + Set(EngagementScore).asJava) + val PREDICTED_IS_DWELLED_IN_BOUNDS_V1 = new Continuous( + name("recap.engagement_predicted.is_dwelled_in_bounds_v1"), + Set(EngagementScore).asJava) + val PREDICTED_DWELL_NORMALIZED_OVERALL = new Continuous( + name("recap.engagement_predicted.dwell_normalized_overall"), + Set(EngagementScore).asJava) + val PREDICTED_DWELL_CDF = + new Continuous(name("recap.engagement_predicted.dwell_cdf"), Set(EngagementScore).asJava) + val PREDICTED_DWELL_CDF_OVERALL = new Continuous( + name("recap.engagement_predicted.dwell_cdf_overall"), + Set(EngagementScore).asJava) + val PREDICTED_IS_DWELLED = + new Continuous(name("recap.engagement_predicted.is_dwelled"), Set(EngagementScore).asJava) + + val PREDICTED_IS_DWELLED_1S = + new Continuous(name("recap.engagement_predicted.is_dwelled_1s"), Set(EngagementScore).asJava) + val PREDICTED_IS_DWELLED_2S = + new Continuous(name("recap.engagement_predicted.is_dwelled_2s"), Set(EngagementScore).asJava) + val PREDICTED_IS_DWELLED_3S = + new Continuous(name("recap.engagement_predicted.is_dwelled_3s"), Set(EngagementScore).asJava) + val PREDICTED_IS_DWELLED_4S = + new Continuous(name("recap.engagement_predicted.is_dwelled_4s"), Set(EngagementScore).asJava) + val PREDICTED_IS_DWELLED_5S = + new Continuous(name("recap.engagement_predicted.is_dwelled_5s"), Set(EngagementScore).asJava) + val PREDICTED_IS_DWELLED_6S = + new Continuous(name("recap.engagement_predicted.is_dwelled_6s"), Set(EngagementScore).asJava) + val PREDICTED_IS_DWELLED_7S = + new Continuous(name("recap.engagement_predicted.is_dwelled_7s"), Set(EngagementScore).asJava) + val PREDICTED_IS_DWELLED_8S = + new 
Continuous(name("recap.engagement_predicted.is_dwelled_8s"), Set(EngagementScore).asJava) + val PREDICTED_IS_DWELLED_9S = + new Continuous(name("recap.engagement_predicted.is_dwelled_9s"), Set(EngagementScore).asJava) + val PREDICTED_IS_DWELLED_10S = + new Continuous(name("recap.engagement_predicted.is_dwelled_10s"), Set(EngagementScore).asJava) + + val PREDICTED_IS_SKIPPED_1S = + new Continuous(name("recap.engagement_predicted.is_skipped_1s"), Set(EngagementScore).asJava) + val PREDICTED_IS_SKIPPED_2S = + new Continuous(name("recap.engagement_predicted.is_skipped_2s"), Set(EngagementScore).asJava) + val PREDICTED_IS_SKIPPED_3S = + new Continuous(name("recap.engagement_predicted.is_skipped_3s"), Set(EngagementScore).asJava) + val PREDICTED_IS_SKIPPED_4S = + new Continuous(name("recap.engagement_predicted.is_skipped_4s"), Set(EngagementScore).asJava) + val PREDICTED_IS_SKIPPED_5S = + new Continuous(name("recap.engagement_predicted.is_skipped_5s"), Set(EngagementScore).asJava) + val PREDICTED_IS_SKIPPED_6S = + new Continuous(name("recap.engagement_predicted.is_skipped_6s"), Set(EngagementScore).asJava) + val PREDICTED_IS_SKIPPED_7S = + new Continuous(name("recap.engagement_predicted.is_skipped_7s"), Set(EngagementScore).asJava) + val PREDICTED_IS_SKIPPED_8S = + new Continuous(name("recap.engagement_predicted.is_skipped_8s"), Set(EngagementScore).asJava) + val PREDICTED_IS_SKIPPED_9S = + new Continuous(name("recap.engagement_predicted.is_skipped_9s"), Set(EngagementScore).asJava) + val PREDICTED_IS_SKIPPED_10S = + new Continuous(name("recap.engagement_predicted.is_skipped_10s"), Set(EngagementScore).asJava) + + val PREDICTED_IS_HOME_LATEST_VISITED = new Continuous( + name("recap.engagement_predicted.is_home_latest_visited"), + Set(EngagementScore).asJava) + val PREDICTED_IS_NEGATIVE_FEEDBACK = + new Continuous( + name("recap.engagement_predicted.is_negative_feedback"), + Set(EngagementScore).asJava) + val PREDICTED_IS_NEGATIVE_FEEDBACK_V2 = + new Continuous( + 
name("recap.engagement_predicted.is_negative_feedback_v2"), + Set(EngagementScore).asJava) + val PREDICTED_IS_WEAK_NEGATIVE_FEEDBACK = + new Continuous( + name("recap.engagement_predicted.is_weak_negative_feedback"), + Set(EngagementScore).asJava) + val PREDICTED_IS_STRONG_NEGATIVE_FEEDBACK = + new Continuous( + name("recap.engagement_predicted.is_strong_negative_feedback"), + Set(EngagementScore).asJava) + val PREDICTED_IS_REPORT_TWEET_CLICKED = + new Continuous( + name("recap.engagement_predicted.is_report_tweet_clicked"), + Set(EngagementScore).asJava) + val PREDICTED_IS_UNFOLLOW_TOPIC = + new Continuous( + name("recap.engagement_predicted.is_unfollow_topic"), + Set(EngagementScore).asJava) + val PREDICTED_IS_RELEVANCE_PROMPT_YES_CLICKED = new Continuous( + name("recap.engagement_predicted.is_relevance_prompt_yes_clicked"), + Set(EngagementScore).asJava) + + // engagement for following user from any surface area + val PREDICTED_IS_FOLLOWED_FROM_ANY_SURFACE_AREA = new Continuous( + "recap.engagement_predicted.is_followed_from_any_surface_area", + Set(EngagementScore).asJava) + + + // These are global engagement counts for the Tweets. 
+ val FAV_COUNT_V2 = new Continuous( + name("recap.earlybird.fav_count_v2"), + Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) + val RETWEET_COUNT_V2 = new Continuous( + name("recap.earlybird.retweet_count_v2"), + Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) + val REPLY_COUNT_V2 = new Continuous( + name("recap.earlybird.reply_count_v2"), + Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) + + val HAS_US_POLITICAL_ANNOTATION = new Binary( + name("recap.has_us_political_annotation"), + Set(SemanticcoreClassification).asJava + ) + + val HAS_US_POLITICAL_ALL_GROUPS_ANNOTATION = new Binary( + name("recap.has_us_political_all_groups_annotation"), + Set(SemanticcoreClassification).asJava + ) + + val HAS_US_POLITICAL_ANNOTATION_HIGH_RECALL = new Binary( + name("recap.has_us_political_annotation_high_recall"), + Set(SemanticcoreClassification).asJava + ) + + val HAS_US_POLITICAL_ANNOTATION_HIGH_RECALL_V2 = new Binary( + name("recap.has_us_political_annotation_high_recall_v2"), + Set(SemanticcoreClassification).asJava + ) + + val HAS_US_POLITICAL_ANNOTATION_HIGH_PRECISION_V0 = new Binary( + name("recap.has_us_political_annotation_high_precision_v0"), + Set(SemanticcoreClassification).asJava + ) + + val HAS_US_POLITICAL_ANNOTATION_BALANCED_PRECISION_RECALL_V0 = new Binary( + name("recap.has_us_political_annotation_balanced_precision_recall_v0"), + Set(SemanticcoreClassification).asJava + ) + + val HAS_US_POLITICAL_ANNOTATION_HIGH_RECALL_V3 = new Binary( + name("recap.has_us_political_annotation_high_recall_v3"), + Set(SemanticcoreClassification).asJava + ) + + val HAS_US_POLITICAL_ANNOTATION_HIGH_PRECISION_V3 = new Binary( + name("recap.has_us_political_annotation_high_precision_v3"), + Set(SemanticcoreClassification).asJava + ) + + val HAS_US_POLITICAL_ANNOTATION_BALANCED_V3 = new Binary( + name("recap.has_us_political_annotation_balanced_v3"), + Set(SemanticcoreClassification).asJava + ) + +} diff --git 
a/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeaturesUtils.scala b/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeaturesUtils.scala
new file mode 100644
index 0000000000..edf152cda8
--- /dev/null
+++ b/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeaturesUtils.scala
@@ -0,0 +1,29 @@
+package com.twitter.timelines.prediction.features.recap
+
+/**
+ * Lookup tables that pair each engagement label feature in [[RecapFeatures]] with the
+ * feature holding the predicted score for that engagement.
+ *
+ * The two maps list the same label/score pairs, keyed once by label feature name and once
+ * by label feature id, so an entry added to or removed from one must be mirrored in the other.
+ */
+object RecapFeaturesUtils {
+  // Label feature name -> predicted-score feature id.
+  // This needs to be updated if an engagement model is added or removed from prediction service.
+  val scoreFeatureIdsMap: Map[String, Long] = Map(
+    RecapFeatures.IS_FAVORITED.getFeatureName -> RecapFeatures.PREDICTED_IS_FAVORITED.getFeatureId,
+    RecapFeatures.IS_REPLIED.getFeatureName -> RecapFeatures.PREDICTED_IS_REPLIED.getFeatureId,
+    RecapFeatures.IS_RETWEETED.getFeatureName -> RecapFeatures.PREDICTED_IS_RETWEETED.getFeatureId,
+    RecapFeatures.IS_GOOD_CLICKED_CONVO_DESC_V1.getFeatureName -> RecapFeatures.PREDICTED_IS_GOOD_CLICKED_V1.getFeatureId,
+    RecapFeatures.IS_GOOD_CLICKED_CONVO_DESC_V2.getFeatureName -> RecapFeatures.PREDICTED_IS_GOOD_CLICKED_V2.getFeatureId,
+    // Disabled entry, kept for reference:
+    // RecapFeatures.IS_NEGATIVE_FEEDBACK_V2.getFeatureName -> RecapFeatures.PREDICTED_IS_NEGATIVE_FEEDBACK_V2.getFeatureId,
+    RecapFeatures.IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED.getFeatureName -> RecapFeatures.PREDICTED_IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED.getFeatureId,
+    RecapFeatures.IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR.getFeatureName -> RecapFeatures.PREDICTED_IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR.getFeatureId
+  )
+
+  // Label feature id -> predicted-score feature id (same pairs as scoreFeatureIdsMap).
+  // This needs to be updated if an engagement model is added or removed from prediction service.
+  val labelFeatureIdToScoreFeatureIdsMap: Map[Long, Long] = Map(
+    RecapFeatures.IS_FAVORITED.getFeatureId -> RecapFeatures.PREDICTED_IS_FAVORITED.getFeatureId,
+    RecapFeatures.IS_REPLIED.getFeatureId -> RecapFeatures.PREDICTED_IS_REPLIED.getFeatureId,
+    RecapFeatures.IS_RETWEETED.getFeatureId -> RecapFeatures.PREDICTED_IS_RETWEETED.getFeatureId,
+    RecapFeatures.IS_GOOD_CLICKED_CONVO_DESC_V1.getFeatureId -> RecapFeatures.PREDICTED_IS_GOOD_CLICKED_V1.getFeatureId,
+    RecapFeatures.IS_GOOD_CLICKED_CONVO_DESC_V2.getFeatureId -> RecapFeatures.PREDICTED_IS_GOOD_CLICKED_V2.getFeatureId,
+    // Disabled entry, kept for reference. NOTE(review): if re-enabled, the key must use
+    // getFeatureId (not getFeatureName) to match this map's Long key type.
+    // RecapFeatures.IS_NEGATIVE_FEEDBACK_V2.getFeatureId -> RecapFeatures.PREDICTED_IS_NEGATIVE_FEEDBACK_V2.getFeatureId,
+    RecapFeatures.IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED.getFeatureId -> RecapFeatures.PREDICTED_IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED.getFeatureId,
+    RecapFeatures.IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR.getFeatureId -> RecapFeatures.PREDICTED_IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR.getFeatureId
+  )
+
+  // Names of every label feature that has a score feature registered above.
+  val labelFeatureNames: Seq[String] = scoreFeatureIdsMap.keys.toSeq
+}
diff --git a/src/scala/com/twitter/timelines/prediction/features/request_context/BUILD b/src/scala/com/twitter/timelines/prediction/features/request_context/BUILD
new file mode 100644
index 0000000000..6fc497bf3d
--- /dev/null
+++ b/src/scala/com/twitter/timelines/prediction/features/request_context/BUILD
@@ -0,0 +1,9 @@
+scala_library(
+    sources = ["*.scala"],
+    platform = "java8",
+    tags = ["bazel-compatible"],
+    dependencies = [
+        "src/java/com/twitter/ml/api:api-base",
+        "src/thrift/com/twitter/dal/personal_data:personal_data-java",
+    ],
+)
diff --git a/src/scala/com/twitter/timelines/prediction/features/request_context/RequestContextFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/request_context/RequestContextFeatures.scala
new file mode 100644
index 0000000000..a7dd28852c
--- /dev/null
+++
b/src/scala/com/twitter/timelines/prediction/features/request_context/RequestContextFeatures.scala
@@ -0,0 +1,57 @@
+package com.twitter.timelines.prediction.features.request_context
+
+import com.twitter.ml.api.FeatureContext
+import com.twitter.ml.api.Feature._
+import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
+import scala.collection.JavaConverters._
+
+/**
+ * Feature definitions under the "request_context." name prefix, plus a [[FeatureContext]]
+ * that contains all of them.
+ */
+object RequestContextFeatures {
+  val COUNTRY_CODE =
+    new Text("request_context.country_code", Set(PrivateCountryOrRegion, InferredCountry).asJava)
+  val LANGUAGE_CODE = new Text(
+    "request_context.language_code",
+    Set(GeneralSettings, ProvidedLanguage, InferredLanguage).asJava)
+  val REQUEST_PROVENANCE = new Text("request_context.request_provenance", Set(AppUsage).asJava)
+  val DISPLAY_WIDTH = new Continuous("request_context.display_width", Set(OtherDeviceInfo).asJava)
+  val DISPLAY_HEIGHT = new Continuous("request_context.display_height", Set(OtherDeviceInfo).asJava)
+  val DISPLAY_DPI = new Continuous("request_context.display_dpi", Set(OtherDeviceInfo).asJava)
+
+  // The following features are Discrete rather than Continuous because a continuous value
+  // cannot express that hour 23 is adjacent to hour 0. Instead, each hour/day slice is treated
+  // independently, like a set of sparse binary features.
+  val TIMESTAMP_GMT_HOUR =
+    new Discrete("request_context.timestamp_gmt_hour", Set(PrivateTimestamp).asJava)
+  val TIMESTAMP_GMT_DOW =
+    new Discrete("request_context.timestamp_gmt_dow", Set(PrivateTimestamp).asJava)
+
+  val IS_GET_INITIAL = new Binary("request_context.is_get_initial")
+  val IS_GET_MIDDLE = new Binary("request_context.is_get_middle")
+  val IS_GET_NEWER = new Binary("request_context.is_get_newer")
+  val IS_GET_OLDER = new Binary("request_context.is_get_older")
+
+  // The following features are Discrete rather than Binary because the source field is
+  // Option[Boolean] and we want to distinguish Some(false) from None. None will be converted to -1.
+  val IS_POLLING = new Discrete("request_context.is_polling")
+  val IS_SESSION_START = new Discrete("request_context.is_session_start")
+
+  // Helps distinguish requests from "home" vs "home_latest" (reverse chron home view).
+  val TIMELINE_KIND = new Text("request_context.timeline_kind")
+
+  // Context containing every feature declared in this object.
+  val featureContext = new FeatureContext(
+    COUNTRY_CODE,
+    LANGUAGE_CODE,
+    REQUEST_PROVENANCE,
+    DISPLAY_WIDTH,
+    DISPLAY_HEIGHT,
+    DISPLAY_DPI,
+    TIMESTAMP_GMT_HOUR,
+    TIMESTAMP_GMT_DOW,
+    IS_GET_INITIAL,
+    IS_GET_MIDDLE,
+    IS_GET_NEWER,
+    IS_GET_OLDER,
+    IS_POLLING,
+    IS_SESSION_START,
+    TIMELINE_KIND
+  )
+}
diff --git a/src/scala/com/twitter/timelines/prediction/features/simcluster/BUILD b/src/scala/com/twitter/timelines/prediction/features/simcluster/BUILD
new file mode 100644
index 0000000000..ec194353b6
--- /dev/null
+++ b/src/scala/com/twitter/timelines/prediction/features/simcluster/BUILD
@@ -0,0 +1,13 @@
+scala_library(
+    sources = ["*.scala"],
+    platform = "java8",
+    tags = ["bazel-compatible"],
+    dependencies = [
+        "src/java/com/twitter/ml/api:api-base",
+        "src/thrift/com/twitter/dal/personal_data:personal_data-java",
+        "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
+        "src/thrift/com/twitter/timelines/suggests/common:record-scala",
+        "timelines/data_processing/ml_util/aggregation_framework:common_types",
+        "timelines/data_processing/ml_util/aggregation_framework/conversion:for-timelines",
+    ],
+)
diff --git a/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterFeatures.scala
new file mode 100644
index 0000000000..4d2b4db817
--- /dev/null
+++ b/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterFeatures.scala
@@ -0,0 +1,61 @@
+package com.twitter.timelines.prediction.features.simcluster
+
+import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
+import com.twitter.finagle.stats.StatsReceiver
+import
com.twitter.ml.api.Feature._
+import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
+import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
+import scala.collection.JavaConverters._
+
+/**
+ * Builds [[SimclusterFeatures]] from a user's interested-in clusters and counts inputs whose
+ * model version differs from [[SimclusterFeatures.SIMCLUSTER_MODEL_VERSION]].
+ *
+ * @param statsReceiver parent receiver; counters are scoped under this class's simple name
+ */
+class SimclusterFeaturesHelper(statsReceiver: StatsReceiver) {
+  import SimclusterFeatures._
+
+  private[this] val scopedStatsReceiver = statsReceiver.scope(getClass.getSimpleName)
+  private[this] val invalidSimclusterModelVersion = scopedStatsReceiver
+    .counter("invalidSimclusterModelVersion")
+
+  /**
+   * Converts a (userId, clusters) pair into [[SimclusterFeatures]].
+   *
+   * @return Some(features) holding the fav score of every cluster that has one, keyed by the
+   *         stringified cluster id, or None when the record's model version is not the
+   *         hardcoded one
+   */
+  def fromUserClusterInterestsPair(
+    userInterestClustersPair: (Long, ClustersUserIsInterestedIn)
+  ): Option[SimclusterFeatures] = {
+    val (userId, userInterestClusters) = userInterestClustersPair
+    if (userInterestClusters.knownForModelVersion == SIMCLUSTER_MODEL_VERSION) {
+      val userInterestClustersFavScores = for {
+        (clusterId, scores) <- userInterestClusters.clusterIdToScores
+        favScore <- scores.favScore
+      } yield (clusterId.toString, favScore)
+      Some(
+        SimclusterFeatures(
+          userId,
+          userInterestClusters.knownForModelVersion,
+          userInterestClustersFavScores.toMap
+        )
+      )
+    } else {
+      // We maintain this counter to make sure that the hardcoded modelVersion we are using is correct.
+      invalidSimclusterModelVersion.incr()
+      None
+    }
+  }
+}
+
+object SimclusterFeatures {
+  // Check http://go/simclustersv2runbook for production versions
+  // Our models are trained for this specific model version only.
+  val SIMCLUSTER_MODEL_VERSION = "20M_145K_dec11"
+  // Common name prefix of every simcluster feature; it embeds the model version.
+  val prefix = s"simcluster.v2.$SIMCLUSTER_MODEL_VERSION"
+
+  val SIMCLUSTER_USER_INTEREST_CLUSTER_SCORES = new SparseContinuous(
+    s"$prefix.user_interest_cluster_scores",
+    Set(EngagementScore, InferredInterests).asJava
+  )
+  val SIMCLUSTER_USER_INTEREST_CLUSTER_IDS = new SparseBinary(
+    s"$prefix.user_interest_cluster_ids",
+    Set(InferredInterests).asJava
+  )
+  val SIMCLUSTER_MODEL_VERSION_METADATA = new Text("meta.simcluster_version")
+}
+
+/**
+ * A user's simcluster interests for one model version.
+ *
+ * @param interestClusterScoresMap score per cluster, keyed by the stringified cluster id
+ */
+case class SimclusterFeatures(
+  userId: Long,
+  modelVersion: String,
+  interestClusterScoresMap: Map[String, Double])
diff --git a/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterTweetFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterTweetFeatures.scala
new file mode 100644
index 0000000000..355a89c22d
--- /dev/null
+++ b/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterTweetFeatures.scala
@@ -0,0 +1,150 @@
+package com.twitter.timelines.prediction.features.simcluster
+
+import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
+import com.twitter.finagle.stats.StatsReceiver
+import com.twitter.ml.api.{Feature, FeatureContext}
+import com.twitter.ml.api.Feature.{Continuous, SparseBinary, SparseContinuous}
+import com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion._
+import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
+import com.twitter.timelines.suggests.common.record.thriftscala.SuggestionRecord
+import scala.collection.JavaConverters._
+
+/**
+ * Computes per-tweet simcluster score lists for the clusters shared between a user's
+ * interested-in clusters and a tweet's top-K clusters.
+ *
+ * @param statsReceiver parent receiver; counters are scoped under this class's simple name
+ */
+class SimclusterTweetFeatures(statsReceiver: StatsReceiver) extends CombineCountsBase {
+  import SimclusterTweetFeatures._
+
+  private[this] val scopedStatsReceiver = statsReceiver.scope(getClass.getSimpleName)
+  private[this] val invalidSimclusterModelVersion = scopedStatsReceiver
+    .counter("invalidSimclusterModelVersion")
+  private[this] val getFeaturesFromOverlappingSimclusterIdsCount = scopedStatsReceiver
+    .counter("getFeaturesFromOverlappingSimclusterIdsCount")
+  private[this] val emptySimclusterMaps = scopedStatsReceiver
+    .counter("emptySimclusterMaps")
+  private[this] val nonOverlappingSimclusterMaps = scopedStatsReceiver
+    .counter("nonOverlappingSimclusterMaps")
+
+  // Parameters required by CombineCountsBase
+  override val topK: Int = 5
+  override val hardLimit: Option[Int] = None
+  override val precomputedCountFeatures: Seq[Feature[_]] = Seq(
+    SIMCLUSTER_TWEET_TOPK_SORT_BY_TWEET_SCORE,
+    SIMCLUSTER_TWEET_TOPK_SORT_BY_COMBINED_SCORE
+  )
+
+  /**
+   * Returns, for every cluster id present in both maps, the tweet score and the
+   * user-score * tweet-score product, as two lists of equal length.
+   *
+   * Returns an empty map when either input is empty or when no cluster id is shared.
+   */
+  private def getFeaturesFromOverlappingSimclusterIds(
+    userSimclustersInterestedInMap: Map[String, Double],
+    tweetSimclustersTopKMap: Map[String, Double]
+  ): Map[Feature[_], List[Double]] = {
+    getFeaturesFromOverlappingSimclusterIdsCount.incr()
+    if (userSimclustersInterestedInMap.isEmpty || tweetSimclustersTopKMap.isEmpty) {
+      emptySimclusterMaps.incr()
+      Map.empty
+    } else {
+      val overlappingSimclusterIds =
+        userSimclustersInterestedInMap.keySet intersect tweetSimclustersTopKMap.keySet
+      if (overlappingSimclusterIds.isEmpty) {
+        nonOverlappingSimclusterMaps.incr()
+        Map.empty
+      } else {
+        // Convert to a List before mapping: mapping and unzipping a Set yields Sets, which
+        // silently drops clusters that happen to share a score and can leave the two
+        // score lists with different lengths.
+        val (combinedScores, tweetScores) = overlappingSimclusterIds.toList.map { id =>
+          val tweetScore = tweetSimclustersTopKMap.getOrElse(id, 0.0)
+          val combinedScore = userSimclustersInterestedInMap.getOrElse(id, 0.0) * tweetScore
+          (combinedScore, tweetScore)
+        }.unzip
+        Map(
+          SIMCLUSTER_TWEET_TOPK_SORT_BY_COMBINED_SCORE -> combinedScores,
+          SIMCLUSTER_TWEET_TOPK_SORT_BY_TWEET_SCORE -> tweetScores
+        )
+      }
+    }
+  }
+
+  /**
+   * Normalizes the user clusters from `suggestionRecord` and the tweet's top-K clusters, then
+   * computes the overlap score lists.
+   */
+  def getCountFeaturesValuesMap(
+    suggestionRecord: SuggestionRecord,
+    simclustersTweetTopKMap: Map[String, Double]
+  ): Map[Feature[_], List[Double]] = {
+    val userSimclustersInterestedInMap = formatUserSimclustersInterestedIn(suggestionRecord)
+
+    val tweetSimclustersTopKMap = formatTweetSimclustersTopK(simclustersTweetTopKMap)
+
+    getFeaturesFromOverlappingSimclusterIds(userSimclustersInterestedInMap, tweetSimclustersTopKMap)
+  }
+
+  /**
+   * Keeps only the entries whose cluster id mentions the hardcoded model version.
+   *
+   * @return None when the input is None or when no entry survives the filter
+   */
+  def filterByModelVersion(
+    simclustersMapOpt: Option[Map[String, Double]]
+  ): Option[Map[String, Double]] = {
+    simclustersMapOpt.flatMap { simclustersMap =>
+      val filteredSimclustersMap = simclustersMap.filter {
+        case (clusterId, _) =>
+          // The clusterId format is ModelVersion.IntegerClusterId.ScoreType as specified at
+          // com.twitter.ml.featurestore.catalog.features.recommendations.SimClustersV2TweetTopClusters
+          clusterId.contains(SimclusterFeatures.SIMCLUSTER_MODEL_VERSION)
+      }
+
+      // The assumption is that the simclustersMap will contain clusterIds with the same modelVersion.
+      // We maintain this counter to make sure that the hardcoded modelVersion we are using is correct.
+      if (simclustersMap.size > filteredSimclustersMap.size) {
+        invalidSimclusterModelVersion.incr()
+      }
+
+      if (filteredSimclustersMap.nonEmpty) Some(filteredSimclustersMap) else None
+    }
+  }
+
+  val allFeatures: Seq[Feature[_]] = outputFeaturesPostMerge.toSeq ++ Seq(
+    SIMCLUSTER_TWEET_TOPK_CLUSTER_IDS,
+    SIMCLUSTER_TWEET_TOPK_CLUSTER_SCORES)
+  val featureContext = new FeatureContext(allFeatures: _*)
+}
+
+object SimclusterTweetFeatures {
+  val SIMCLUSTER_TWEET_TOPK_CLUSTER_IDS = new SparseBinary(
+    s"${SimclusterFeatures.prefix}.tweet_topk_cluster_ids",
+    Set(InferredInterests).asJava
+  )
+  val SIMCLUSTER_TWEET_TOPK_CLUSTER_SCORES = new SparseContinuous(
+    s"${SimclusterFeatures.prefix}.tweet_topk_cluster_scores",
+    Set(EngagementScore, InferredInterests).asJava
+  )
+
+  val SIMCLUSTER_TWEET_TOPK_CLUSTER_ID =
+    TypedAggregateGroup.sparseFeature(SIMCLUSTER_TWEET_TOPK_CLUSTER_IDS)
+
+  val SIMCLUSTER_TWEET_TOPK_SORT_BY_TWEET_SCORE = new Continuous(
+    s"${SimclusterFeatures.prefix}.tweet_topk_sort_by_tweet_score",
+    Set(EngagementScore, InferredInterests).asJava
+  )
+
+  val SIMCLUSTER_TWEET_TOPK_SORT_BY_COMBINED_SCORE = new Continuous(
+    s"${SimclusterFeatures.prefix}.tweet_topk_sort_by_combined_score",
+    Set(EngagementScore, InferredInterests).asJava
+  )
+
+  /**
+   * Extracts the user's fav score per cluster, keyed by the stringified cluster id.
+   *
+   * @return an empty map when the record carries no interested-in clusters or when their
+   *         model version is not [[SimclusterFeatures.SIMCLUSTER_MODEL_VERSION]]
+   */
+  def formatUserSimclustersInterestedIn(suggestionRecord: SuggestionRecord): Map[String, Double] = {
+    suggestionRecord.userSimclustersInterestedIn
+      .filter(_.knownForModelVersion == SimclusterFeatures.SIMCLUSTER_MODEL_VERSION)
+      .map { clustersUserIsInterestedIn =>
+        val favScores = for {
+          (clusterId, scores) <- clustersUserIsInterestedIn.clusterIdToScores
+          favScore <- scores.favScore
+        } yield (clusterId.toString, favScore)
+        favScores.toMap
+      }
+      .getOrElse(Map.empty[String, Double])
+  }
+
+  /**
+   * Re-keys the tweet's top-K cluster scores by their IntegerClusterId.
+   *
+   * Entries whose id has no second dot-separated component are skipped instead of
+   * failing with an ArrayIndexOutOfBoundsException.
+   */
+  def formatTweetSimclustersTopK(
+    simclustersTweetTopKMap: Map[String, Double]
+  ): Map[String, Double] = {
+    simclustersTweetTopKMap.flatMap {
+      case (clusterId, score) =>
+        // The clusterId format is as specified at
+        // com.twitter.ml.featurestore.catalog.features.recommendations.SimClustersV2TweetTopClusters
+        // and we want to extract the IntegerClusterId, which is at position 1.
+        // The split function takes a regex; therefore, we need to escape . and we also need to escape
+        // \ since they are both special characters. Hence, the double \\.
+        clusterId.split("\\.") match {
+          case Array(_, integerClusterId, _*) => Some((integerClusterId, score))
+          case _ => None
+        }
+    }
+  }
+}
diff --git a/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclustersScoresFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclustersScoresFeatures.scala
new file mode 100644
index 0000000000..0629636c07
--- /dev/null
+++ b/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclustersScoresFeatures.scala
@@ -0,0 +1,43 @@
+package com.twitter.timelines.prediction.features.simcluster
+
+import com.twitter.dal.personal_data.thriftjava.PersonalDataType.SemanticcoreClassification
+import com.twitter.ml.api.Feature
+import com.twitter.ml.api.Feature.Continuous
+import com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion.CombineCountsBase
+import scala.collection.JavaConverters._
+
+/**
+ * Continuous similarity-score features under the "recommendations.sim_clusters_scores" prefix,
+ * registered as the precomputed count features of a [[CombineCountsBase]].
+ */
+object SimclustersScoresFeatures extends CombineCountsBase {
+  override def topK: Int = 2
+
+  override def hardLimit: Option[Int] = Some(20)
+
+  val prefix = "recommendations.sim_clusters_scores"
+  val TOPIC_CONSUMER_TWEET_EMBEDDING_Cs = new Continuous(
+    s"$prefix.localized_topic_consumer_tweet_embedding_cosine_similarity",
+    Set(SemanticcoreClassification).asJava)
+  val TOPIC_PRODUCER_TWEET_EMBEDDING_Cs = new Continuous(
+    s"$prefix.topic_producer_tweet_embedding_cosine_similarity",
+    Set(SemanticcoreClassification).asJava)
+  val USER_TOPIC_CONSUMER_TWEET_EMBEDDING_COSINE_SIM = new Continuous(
+    s"$prefix.user_interested_in_localized_topic_consumer_embedding_cosine_similarity",
+    Set(SemanticcoreClassification).asJava)
+  val USER_TOPIC_CONSUMER_TWEET_EMBEDDING_DOT_PRODUCT = new Continuous(
+    s"$prefix.user_interested_in_localized_topic_consumer_embedding_dot_product",
+    Set(SemanticcoreClassification).asJava)
+  val USER_TOPIC_PRODUCER_TWEET_EMBEDDING_COSINE_SIM = new Continuous(
+    s"$prefix.user_interested_in_localized_topic_producer_embedding_cosine_similarity",
+    Set(SemanticcoreClassification).asJava)
+  val USER_TOPIC_PRODUCER_TWEET_EMBEDDING_DOT_PRODUCT = new Continuous(
+    s"$prefix.user_interested_in_localized_topic_producer_embedding_dot_product",
+    Set(SemanticcoreClassification).asJava)
+
+  override def precomputedCountFeatures: Seq[Feature[_]] =
+    Seq(
+      TOPIC_CONSUMER_TWEET_EMBEDDING_Cs,
+      TOPIC_PRODUCER_TWEET_EMBEDDING_Cs,
+      USER_TOPIC_CONSUMER_TWEET_EMBEDDING_COSINE_SIM,
+      USER_TOPIC_CONSUMER_TWEET_EMBEDDING_DOT_PRODUCT,
+      USER_TOPIC_PRODUCER_TWEET_EMBEDDING_COSINE_SIM,
+      USER_TOPIC_PRODUCER_TWEET_EMBEDDING_DOT_PRODUCT
+    )
+}
diff --git a/src/scala/com/twitter/timelines/prediction/features/socialproof/BUILD b/src/scala/com/twitter/timelines/prediction/features/socialproof/BUILD
new file mode 100644
index 0000000000..0c00b1e5b0
--- /dev/null
+++ b/src/scala/com/twitter/timelines/prediction/features/socialproof/BUILD
@@ -0,0 +1,15 @@
+scala_library(
+    name = "socialproof_features",
+    sources = ["*.scala"],
+    platform = "java8",
+    tags = ["bazel-compatible"],
+    dependencies = [
+        "3rdparty/jvm/com/ibm/icu:icu4j",
+        "src/java/com/twitter/ml/api:api-base",
+        "src/scala/com/twitter/ml/api/util",
+        "src/scala/com/twitter/timelines/util",
+        "src/thrift/com/twitter/dal/personal_data:personal_data-java",
+        "src/thrift/com/twitter/ml/api:data-java",
+        "src/thrift/com/twitter/timelines/socialproof:socialproof-scala",
+    ],
+)
diff --git a/src/scala/com/twitter/timelines/prediction/features/socialproof/SocialProofFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/socialproof/SocialProofFeatures.scala
new file mode 100644
index 0000000000..163ba7efab
--- /dev/null
+++ b/src/scala/com/twitter/timelines/prediction/features/socialproof/SocialProofFeatures.scala
@@ -0,0 +1,172 @@
+package com.twitter.timelines.prediction.features.socialproof
+
+import com.twitter.ml.api.DataRecord
+import com.twitter.ml.api.Feature.Binary
+import com.twitter.ml.api.Feature.Continuous
+import com.twitter.ml.api.Feature.SparseBinary
+import com.twitter.ml.api.util.FDsl._
+import
com.twitter.timelines.prediction.features.socialproof.SocialProofDataRecordFeatures._
+import com.twitter.timelines.socialproof.thriftscala.SocialProof
+import com.twitter.timelines.socialproof.v1.thriftscala.SocialProofType
+import com.twitter.timelines.util.CommonTypes.UserId
+import scala.collection.JavaConverters._
+import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
+
+/**
+ * Displayed / undisplayed / total user counts for one kind of social proof, plus the
+ * features they are written to.
+ *
+ * @param userIds users that are displayed
+ * @param count   total number of users; must be at least `userIds.size`
+ */
+abstract class SocialProofUserGroundTruth(userIds: Seq[UserId], count: Int) {
+  require(
+    count >= userIds.size,
+    "count must be equal to or greater than the number of entries in userIds"
+  )
+  // Counts are kept as Double so they can be used directly as ML feature values.
+  val displayedUserCount: Double = userIds.size.toDouble
+  val totalCount: Double = count.toDouble
+  val undisplayedUserCount: Double = totalCount - displayedUserCount
+
+  def featureDisplayedUsers: SparseBinary
+  def featureDisplayedUserCount: Continuous
+  def featureUndisplayedUserCount: Continuous
+  def featureTotalUserCount: Continuous
+
+  /** Writes the displayed user ids and the three counts into `rec`. */
+  def setFeatures(rec: DataRecord): Unit = {
+    rec.setFeatureValue(featureDisplayedUsers, toStringSet(userIds))
+    rec.setFeatureValue(featureDisplayedUserCount, displayedUserCount)
+    rec.setFeatureValue(featureUndisplayedUserCount, undisplayedUserCount)
+    rec.setFeatureValue(featureTotalUserCount, totalCount)
+  }
+
+  /** Stringifies each id and collects the results into a set. */
+  protected def toStringSet(value: Seq[Long]): Set[String] =
+    value.map(id => id.toString).toSet
+}
+
+/** Ground truth for "favorited by" social proof. */
+case class FavoritedBySocialProofUserGroundTruth(userIds: Seq[UserId] = Seq.empty, count: Int = 0)
+    extends SocialProofUserGroundTruth(userIds, count) {
+
+  override val featureDisplayedUsers = SocialProofDisplayedFavoritedByUsers
+  override val featureDisplayedUserCount = SocialProofDisplayedFavoritedByUserCount
+  override val featureUndisplayedUserCount = SocialProofUndisplayedFavoritedByUserCount
+  override val featureTotalUserCount = SocialProofTotalFavoritedByUserCount
+}
+
+/** Ground truth for "retweeted by" social proof. */
+case class RetweetedBySocialProofUserGroundTruth(userIds: Seq[UserId] = Seq.empty, count: Int = 0)
+    extends SocialProofUserGroundTruth(userIds, count) {
+
+  override val featureDisplayedUsers = SocialProofDisplayedRetweetedByUsers
+  override val featureDisplayedUserCount = SocialProofDisplayedRetweetedByUserCount
+  override val featureUndisplayedUserCount = SocialProofUndisplayedRetweetedByUserCount
+  override val featureTotalUserCount = SocialProofTotalRetweetedByUserCount
+}
+
+/** Ground truth for "replied by" social proof. */
+case class RepliedBySocialProofUserGroundTruth(userIds: Seq[UserId] = Seq.empty, count: Int = 0)
+    extends SocialProofUserGroundTruth(userIds, count) {
+
+  override val featureDisplayedUsers = SocialProofDisplayedRepliedByUsers
+  override val featureDisplayedUserCount = SocialProofDisplayedRepliedByUserCount
+  override val featureUndisplayedUserCount = SocialProofUndisplayedRepliedByUserCount
+  override val featureTotalUserCount = SocialProofTotalRepliedByUserCount
+}
+
+/**
+ * Social proof of a candidate, split by engagement kind.
+ *
+ * Nothing is written to the data record when `hasSocialProof` is false.
+ */
+case class SocialProofFeatures(
+  hasSocialProof: Boolean,
+  favoritedBy: FavoritedBySocialProofUserGroundTruth = FavoritedBySocialProofUserGroundTruth(),
+  retweetedBy: RetweetedBySocialProofUserGroundTruth = RetweetedBySocialProofUserGroundTruth(),
+  repliedBy: RepliedBySocialProofUserGroundTruth = RepliedBySocialProofUserGroundTruth()) {
+
+  def setFeatures(dataRecord: DataRecord): Unit = {
+    if (hasSocialProof) {
+      dataRecord.setFeatureValue(HasSocialProof, hasSocialProof)
+      favoritedBy.setFeatures(dataRecord)
+      retweetedBy.setFeatures(dataRecord)
+      repliedBy.setFeatures(dataRecord)
+    }
+  }
+}
+
+object SocialProofFeatures {
+
+  /**
+   * Folds the thrift social proofs into one [[SocialProofFeatures]]; when the same proof type
+   * appears more than once, the last occurrence wins.
+   */
+  def apply(socialProofs: Seq[SocialProof]): SocialProofFeatures = {
+    val withoutGroundTruth = SocialProofFeatures(hasSocialProof = socialProofs.nonEmpty)
+    socialProofs.foldLeft(withoutGroundTruth) { (accumulated, proof) =>
+      val userIds = proof.v1.userIds
+      val count = proof.v1.count
+      proof.v1.socialProofType match {
+        case SocialProofType.FavoritedBy =>
+          accumulated.copy(favoritedBy = FavoritedBySocialProofUserGroundTruth(userIds, count))
+        case SocialProofType.RetweetedBy =>
+          accumulated.copy(retweetedBy = RetweetedBySocialProofUserGroundTruth(userIds, count))
+        case SocialProofType.RepliedBy =>
+          accumulated.copy(repliedBy = RepliedBySocialProofUserGroundTruth(userIds, count))
+        case _ =>
+          accumulated // skip silently instead of breaking jobs, since this isn't used yet
+      }
+    }
+  }
+}
+
+/** Feature definitions under the "recap.social_proof." name prefix. */
+object SocialProofDataRecordFeatures {
+  val HasSocialProof = new Binary("recap.social_proof.has_social_proof")
+
+  val SocialProofDisplayedFavoritedByUsers = new SparseBinary(
+    "recap.social_proof.list.displayed.favorited_by",
+    Set(UserId, PublicLikes, PrivateLikes).asJava
+  )
+  val SocialProofDisplayedFavoritedByUserCount = new Continuous(
+    "recap.social_proof.count.displayed.favorited_by",
+    Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
+  )
+  val SocialProofUndisplayedFavoritedByUserCount = new Continuous(
+    "recap.social_proof.count.undisplayed.favorited_by",
+    Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
+  )
+  val SocialProofTotalFavoritedByUserCount = new Continuous(
+    "recap.social_proof.count.total.favorited_by",
+    Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
+  )
+
+  val SocialProofDisplayedRetweetedByUsers = new SparseBinary(
+    "recap.social_proof.list.displayed.retweeted_by",
+    Set(UserId, PublicRetweets, PrivateRetweets).asJava
+  )
+  val SocialProofDisplayedRetweetedByUserCount = new Continuous(
+    "recap.social_proof.count.displayed.retweeted_by",
+    Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
+  )
+  val SocialProofUndisplayedRetweetedByUserCount = new Continuous(
+    "recap.social_proof.count.undisplayed.retweeted_by",
+    Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
+  )
+  val SocialProofTotalRetweetedByUserCount = new Continuous(
+    "recap.social_proof.count.total.retweeted_by",
+    Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
+  )
+
+  val SocialProofDisplayedRepliedByUsers = new SparseBinary(
+    "recap.social_proof.list.displayed.replied_by",
+    Set(UserId, PublicReplies, PrivateReplies).asJava
+  )
+  val SocialProofDisplayedRepliedByUserCount = new Continuous(
+    "recap.social_proof.count.displayed.replied_by",
+    Set(CountOfPrivateReplies, CountOfPublicReplies).asJava
+  )
+  val SocialProofUndisplayedRepliedByUserCount = new Continuous(
+    "recap.social_proof.count.undisplayed.replied_by",
+    Set(CountOfPrivateReplies, CountOfPublicReplies).asJava
+  )
+  val SocialProofTotalRepliedByUserCount = new Continuous(
+    "recap.social_proof.count.total.replied_by",
+    Set(CountOfPrivateReplies, CountOfPublicReplies).asJava
+  )
+
+  val AllFeatures = Seq(
+    HasSocialProof,
+    SocialProofDisplayedFavoritedByUsers,
+    SocialProofDisplayedFavoritedByUserCount,
+    SocialProofUndisplayedFavoritedByUserCount,
+    SocialProofTotalFavoritedByUserCount,
+    SocialProofDisplayedRetweetedByUsers,
+    SocialProofDisplayedRetweetedByUserCount,
+    SocialProofUndisplayedRetweetedByUserCount,
+    SocialProofTotalRetweetedByUserCount,
+    SocialProofDisplayedRepliedByUsers,
+    SocialProofDisplayedRepliedByUserCount,
+    SocialProofUndisplayedRepliedByUserCount,
+    SocialProofTotalRepliedByUserCount
+  )
+}
diff --git a/src/scala/com/twitter/timelines/prediction/features/time_features/BUILD b/src/scala/com/twitter/timelines/prediction/features/time_features/BUILD
new file mode 100644
index 0000000000..b5c49af36d
--- /dev/null
+++ b/src/scala/com/twitter/timelines/prediction/features/time_features/BUILD
@@ -0,0 +1,10 @@
+scala_library(
+    sources = ["*.scala"],
+    platform = "java8",
+    tags = ["bazel-compatible"],
+    dependencies = [
+        "src/java/com/twitter/ml/api:api-base",
+        "src/thrift/com/twitter/dal/personal_data:personal_data-java",
+        "src/thrift/com/twitter/timelines/time_features:time_features-scala",
+    ],
+)
diff --git a/src/scala/com/twitter/timelines/prediction/features/time_features/TimeDataRecordFeatures.scala
b/src/scala/com/twitter/timelines/prediction/features/time_features/TimeDataRecordFeatures.scala
new file mode 100644
index 0000000000..b398203c31
--- /dev/null
+++ b/src/scala/com/twitter/timelines/prediction/features/time_features/TimeDataRecordFeatures.scala
@@ -0,0 +1,111 @@
+package com.twitter.timelines.prediction.features.time_features
+
+import com.twitter.dal.personal_data.thriftjava.PersonalDataType._
+import com.twitter.ml.api.Feature._
+import scala.collection.JavaConverters._
+import com.twitter.util.Duration
+import com.twitter.conversions.DurationOps._
+
+/** Feature definitions under the "time_features." name prefix. */
+object TimeDataRecordFeatures {
+  val TIME_BETWEEN_NON_POLLING_REQUESTS_AVG = new Continuous(
+    "time_features.time_between_non_polling_requests_avg",
+    Set(PrivateTimestamp).asJava
+  )
+  val TIME_SINCE_TWEET_CREATION = new Continuous("time_features.time_since_tweet_creation")
+  val TIME_SINCE_SOURCE_TWEET_CREATION = new Continuous(
+    "time_features.time_since_source_tweet_creation"
+  )
+  val TIME_SINCE_LAST_NON_POLLING_REQUEST = new Continuous(
+    "time_features.time_since_last_non_polling_request",
+    Set(PrivateTimestamp).asJava
+  )
+  val NON_POLLING_REQUESTS_SINCE_TWEET_CREATION = new Continuous(
+    "time_features.non_polling_requests_since_tweet_creation",
+    Set(PrivateTimestamp).asJava
+  )
+  val TWEET_AGE_RATIO = new Continuous("time_features.tweet_age_ratio")
+  val IS_TWEET_RECYCLED = new Binary("time_features.is_tweet_recycled")
+  // Last Engagement features
+  val LAST_FAVORITE_SINCE_CREATION_HRS = new Continuous(
+    "time_features.earlybird.last_favorite_since_creation_hrs",
+    Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
+  )
+  val LAST_RETWEET_SINCE_CREATION_HRS = new Continuous(
+    "time_features.earlybird.last_retweet_since_creation_hrs",
+    Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
+  )
+  val LAST_REPLY_SINCE_CREATION_HRS = new Continuous(
+    "time_features.earlybird.last_reply_since_creation_hrs",
+    Set(CountOfPrivateReplies, CountOfPublicReplies).asJava
+  )
+  val LAST_QUOTE_SINCE_CREATION_HRS = new Continuous(
+    "time_features.earlybird.last_quote_since_creation_hrs",
+    Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
+  )
+  val TIME_SINCE_LAST_FAVORITE_HRS = new Continuous(
+    "time_features.earlybird.time_since_last_favorite",
+    Set(CountOfPrivateLikes, CountOfPublicLikes).asJava
+  )
+  val TIME_SINCE_LAST_RETWEET_HRS = new Continuous(
+    "time_features.earlybird.time_since_last_retweet",
+    Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
+  )
+  val TIME_SINCE_LAST_REPLY_HRS = new Continuous(
+    "time_features.earlybird.time_since_last_reply",
+    Set(CountOfPrivateReplies, CountOfPublicReplies).asJava
+  )
+  val TIME_SINCE_LAST_QUOTE_HRS = new Continuous(
+    "time_features.earlybird.time_since_last_quote",
+    Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava
+  )
+
+  val TIME_SINCE_VIEWER_ACCOUNT_CREATION_SECS =
+    new Continuous(
+      "time_features.time_since_viewer_account_creation_secs",
+      Set(AccountCreationTime, AgeOfAccount).asJava)
+
+  val USER_ID_IS_SNOWFLAKE_ID =
+    new Binary("time_features.time_user_id_is_snowflake_id", Set(UserType).asJava)
+
+  val IS_30_DAY_NEW_USER =
+    new Binary("time_features.is_day_30_new_user", Set(AccountCreationTime, AgeOfAccount).asJava)
+  val IS_12_MONTH_NEW_USER =
+    new Binary("time_features.is_month_12_new_user", Set(AccountCreationTime, AgeOfAccount).asJava)
+  val ACCOUNT_AGE_INTERVAL =
+    new Discrete("time_features.account_age_interval", Set(AgeOfAccount).asJava)
+}
+
+/** Account-age buckets; ages above 30 days have no bucket. */
+object AccountAgeInterval extends Enumeration {
+  val LTE_1_DAY, GT_1_DAY_LTE_5_DAY, GT_5_DAY_LTE_14_DAY, GT_14_DAY_LTE_30_DAY = Value
+
+  /**
+   * Maps an account age to its bucket.
+   *
+   * @return the bucket whose inclusive upper bound is the first one not exceeded by
+   *         `accountAge`, or None when the age is above 30 days
+   */
+  def fromDuration(accountAge: Duration): Option[AccountAgeInterval.Value] = {
+    // Each branch is reached only when the previous upper bound was exceeded, so the
+    // lower bound of every bucket is implied by the order of the checks.
+    if (accountAge <= 1.day) Some(LTE_1_DAY)
+    else if (accountAge <= 5.days) Some(GT_1_DAY_LTE_5_DAY)
+    else if (accountAge <= 14.days) Some(GT_5_DAY_LTE_14_DAY)
+    else if (accountAge <= 30.days) Some(GT_14_DAY_LTE_30_DAY)
+    else None
+  }
+}
+
+/** Plain container for the time-related values of one candidate; optional fields default to None. */
+case class TimeFeatures(
+  isTweetRecycled: Boolean,
+  timeSinceTweetCreation: Double,
+  isDay30NewUser: Boolean,
+  isMonth12NewUser: Boolean,
+  timeSinceSourceTweetCreation: Double, // same as timeSinceTweetCreation for non-retweets
+  timeSinceViewerAccountCreationSecs: Option[Double],
+  timeBetweenNonPollingRequestsAvg: Option[Double] = None,
+  timeSinceLastNonPollingRequest: Option[Double] = None,
+  nonPollingRequestsSinceTweetCreation: Option[Double] = None,
+  tweetAgeRatio: Option[Double] = None,
+  lastFavSinceCreationHrs: Option[Double] = None,
+  lastRetweetSinceCreationHrs: Option[Double] = None,
+  lastReplySinceCreationHrs: Option[Double] = None,
+  lastQuoteSinceCreationHrs: Option[Double] = None,
+  timeSinceLastFavoriteHrs: Option[Double] = None,
+  timeSinceLastRetweetHrs: Option[Double] = None,
+  timeSinceLastReplyHrs: Option[Double] = None,
+  timeSinceLastQuoteHrs: Option[Double] = None,
+  accountAgeInterval: Option[AccountAgeInterval.Value] = None)
diff --git a/src/scala/com/twitter/timelines/prediction/features/two_hop_features/BUILD b/src/scala/com/twitter/timelines/prediction/features/two_hop_features/BUILD
new file mode 100644
index 0000000000..a4ad0eabf3
--- /dev/null
+++ b/src/scala/com/twitter/timelines/prediction/features/two_hop_features/BUILD
@@ -0,0 +1,10 @@
+scala_library(
+    sources = ["*.scala"],
+    platform = "java8",
+    tags = ["bazel-compatible"],
+    dependencies = [
+        "graph-feature-service/src/main/thrift/com/twitter/graph_feature_service:graph_feature_service_thrift-scala",
+        "src/java/com/twitter/ml/api:api-base",
+        "src/thrift/com/twitter/dal/personal_data:personal_data-java",
+    ],
+)
diff --git a/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeatures.scala
new file mode 100644
index 0000000000..03a1125784
--- /dev/null
+++
b/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeatures.scala @@ -0,0 +1,93 @@ +package com.twitter.timelines.prediction.features.two_hop_features + +import com.twitter.graph_feature_service.thriftscala.EdgeType +import com.twitter.ml.api.Feature._ +import scala.collection.JavaConverters._ +import TwoHopFeaturesConfig.personalDataTypesMap + +object TwoHopFeaturesDescriptor { + val prefix = "two_hop" + val normalizedPostfix = "normalized" + val leftNodeDegreePostfix = "left_degree" + val rightNodeDegreePostfix = "right_degree" + + type TwoHopFeatureMap = Map[(EdgeType, EdgeType), Continuous] + type TwoHopFeatureNodeDegreeMap = Map[EdgeType, Continuous] + + def apply(edgeTypePairs: Seq[(EdgeType, EdgeType)]): TwoHopFeaturesDescriptor = { + new TwoHopFeaturesDescriptor(edgeTypePairs) + } +} + +class TwoHopFeaturesDescriptor(edgeTypePairs: Seq[(EdgeType, EdgeType)]) { + import TwoHopFeaturesDescriptor._ + + def getLeftEdge(edgeTypePair: (EdgeType, EdgeType)): EdgeType = { + edgeTypePair._1 + } + + def getLeftEdgeName(edgeTypePair: (EdgeType, EdgeType)): String = { + getLeftEdge(edgeTypePair).originalName.toLowerCase + } + + def getRightEdge(edgeTypePair: (EdgeType, EdgeType)): EdgeType = { + edgeTypePair._2 + } + + def getRightEdgeName(edgeTypePair: (EdgeType, EdgeType)): String = { + getRightEdge(edgeTypePair).originalName.toLowerCase + } + + val rawFeaturesMap: TwoHopFeatureMap = edgeTypePairs.map(edgeTypePair => { + val leftEdgeType = getLeftEdge(edgeTypePair) + val leftEdgeName = getLeftEdgeName(edgeTypePair) + val rightEdgeType = getRightEdge(edgeTypePair) + val rightEdgeName = getRightEdgeName(edgeTypePair) + val personalDataTypes = ( + personalDataTypesMap.getOrElse(leftEdgeType, Set.empty) ++ + personalDataTypesMap.getOrElse(rightEdgeType, Set.empty) + ).asJava + val rawFeature = new Continuous(s"$prefix.$leftEdgeName.$rightEdgeName", personalDataTypes) + edgeTypePair -> rawFeature + })(collection.breakOut) + + val 
leftNodeDegreeFeaturesMap: TwoHopFeatureNodeDegreeMap = edgeTypePairs.map(edgeTypePair => { + val leftEdgeType = getLeftEdge(edgeTypePair) + val leftEdgeName = getLeftEdgeName(edgeTypePair) + val personalDataTypes = personalDataTypesMap.getOrElse(leftEdgeType, Set.empty).asJava + val leftNodeDegreeFeature = + new Continuous(s"$prefix.$leftEdgeName.$leftNodeDegreePostfix", personalDataTypes) + leftEdgeType -> leftNodeDegreeFeature + })(collection.breakOut) + + val rightNodeDegreeFeaturesMap: TwoHopFeatureNodeDegreeMap = edgeTypePairs.map(edgeTypePair => { + val rightEdgeType = getRightEdge(edgeTypePair) + val rightEdgeName = getRightEdgeName(edgeTypePair) + val personalDataTypes = personalDataTypesMap.getOrElse(rightEdgeType, Set.empty).asJava + val rightNodeDegreeFeature = + new Continuous(s"$prefix.$rightEdgeName.$rightNodeDegreePostfix", personalDataTypes) + rightEdgeType -> rightNodeDegreeFeature + })(collection.breakOut) + + val normalizedFeaturesMap: TwoHopFeatureMap = edgeTypePairs.map(edgeTypePair => { + val leftEdgeType = getLeftEdge(edgeTypePair) + val leftEdgeName = getLeftEdgeName(edgeTypePair) + val rightEdgeType = getRightEdge(edgeTypePair) + val rightEdgeName = getRightEdgeName(edgeTypePair) + val personalDataTypes = ( + personalDataTypesMap.getOrElse(leftEdgeType, Set.empty) ++ + personalDataTypesMap.getOrElse(rightEdgeType, Set.empty) + ).asJava + val normalizedFeature = + new Continuous(s"$prefix.$leftEdgeName.$rightEdgeName.$normalizedPostfix", personalDataTypes) + edgeTypePair -> normalizedFeature + })(collection.breakOut) + + private val rawFeaturesSeq: Seq[Continuous] = rawFeaturesMap.values.toSeq + private val leftNodeDegreeFeaturesSeq: Seq[Continuous] = leftNodeDegreeFeaturesMap.values.toSeq + private val rightNodeDegreeFeaturesSeq: Seq[Continuous] = rightNodeDegreeFeaturesMap.values.toSeq + private val normalizedFeaturesSeq: Seq[Continuous] = normalizedFeaturesMap.values.toSeq + + val featuresSeq: Seq[Continuous] = + rawFeaturesSeq ++ 
leftNodeDegreeFeaturesSeq ++ rightNodeDegreeFeaturesSeq ++ normalizedFeaturesSeq +} diff --git a/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeaturesConfig.scala b/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeaturesConfig.scala new file mode 100644 index 0000000000..ece502e30c --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeaturesConfig.scala @@ -0,0 +1,30 @@ +package com.twitter.timelines.prediction.features.two_hop_features + +import com.twitter.dal.personal_data.thriftjava.PersonalDataType +import com.twitter.graph_feature_service.thriftscala.{EdgeType, FeatureType} + +object TwoHopFeaturesConfig { + val leftEdgeTypes = Seq(EdgeType.Following, EdgeType.Favorite, EdgeType.MutualFollow) + val rightEdgeTypes = Seq( + EdgeType.FollowedBy, + EdgeType.FavoritedBy, + EdgeType.RetweetedBy, + EdgeType.MentionedBy, + EdgeType.MutualFollow) + + val edgeTypePairs: Seq[(EdgeType, EdgeType)] = { + for (leftEdgeType <- leftEdgeTypes; rightEdgeType <- rightEdgeTypes) + yield (leftEdgeType, rightEdgeType) + } + + val featureTypes: Seq[FeatureType] = edgeTypePairs.map(pair => FeatureType(pair._1, pair._2)) + + val personalDataTypesMap: Map[EdgeType, Set[PersonalDataType]] = Map( + EdgeType.Following -> Set(PersonalDataType.CountOfFollowersAndFollowees), + EdgeType.Favorite -> Set( + PersonalDataType.CountOfPrivateLikes, + PersonalDataType.CountOfPublicLikes), + EdgeType.MutualFollow -> Set(PersonalDataType.CountOfFollowersAndFollowees), + EdgeType.FollowedBy -> Set(PersonalDataType.CountOfFollowersAndFollowees) + ) +} diff --git a/src/scala/com/twitter/timelines/prediction/features/user_health/BUILD b/src/scala/com/twitter/timelines/prediction/features/user_health/BUILD new file mode 100644 index 0000000000..598e0c0668 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/user_health/BUILD @@ -0,0 +1,10 @@ +scala_library( + sources = ["*.scala"], + 
platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "src/java/com/twitter/ml/api:api-base", + "src/thrift/com/twitter/dal/personal_data:personal_data-java", + "src/thrift/com/twitter/timelines/author_features/user_health:thrift-scala", + ], +) diff --git a/src/scala/com/twitter/timelines/prediction/features/user_health/UserHealthFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/user_health/UserHealthFeatures.scala new file mode 100644 index 0000000000..7c8c7f8b16 --- /dev/null +++ b/src/scala/com/twitter/timelines/prediction/features/user_health/UserHealthFeatures.scala @@ -0,0 +1,23 @@ +package com.twitter.timelines.prediction.features.user_health + +import com.twitter.ml.api.Feature +import com.twitter.timelines.author_features.user_health.thriftscala.UserState +import com.twitter.dal.personal_data.thriftjava.PersonalDataType.{UserState => UserStatePDT} +import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ +import scala.collection.JavaConverters._ + +object UserHealthFeatures { + val UserState = new Feature.Discrete("user_health.user_state", Set(UserStatePDT, UserType).asJava) + val IsLightMinusUser = + new Feature.Binary("user_health.is_light_minus_user", Set(UserStatePDT, UserType).asJava) + val AuthorState = + new Feature.Discrete("user_health.author_state", Set(UserStatePDT, UserType).asJava) + val NumAuthorFollowers = + new Feature.Continuous("author_health.num_followers", Set(CountOfFollowersAndFollowees).asJava) + val NumAuthorConnectDays = new Feature.Continuous("author_health.num_connect_days") + val NumAuthorConnect = new Feature.Continuous("author_health.num_connect") + + val IsUserVerifiedUnion = new Feature.Binary("user_account.is_user_verified_union") +} + +case class UserHealthFeatures(id: Long, userStateOpt: Option[UserState]) diff --git a/timelines/data_processing/ml_util/aggregation_framework/AggregateGroup.scala b/timelines/data_processing/ml_util/aggregation_framework/AggregateGroup.scala 
new file mode 100644 index 0000000000..6797d838a6 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/AggregateGroup.scala @@ -0,0 +1,124 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework + +import com.twitter.ml.api._ +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetric +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.EasyMetric +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.MaxMetric +import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform +import com.twitter.util.Duration +import java.lang.{Boolean => JBoolean} +import java.lang.{Long => JLong} +import scala.language.existentials + +/** + * A wrapper for [[com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup]] + * (see TypedAggregateGroup.scala) with some convenient syntactic sugar that avoids + * the user having to specify different groups for different types of features. + * Gets translated into multiple strongly typed TypedAggregateGroup(s) + * by the buildTypedAggregateGroups() method defined below. + * + * @param inputSource Source to compute this aggregate over + * @param preTransforms Sequence of [[OneToSomeTransform]] that is applied to + * data records pre-aggregation (e.g. discretization, renaming, sampling) + * @param aggregatePrefix Prefix to use for naming resultant aggregate features + * @param keys Features to group by when computing the aggregates + * (e.g. USER_ID, AUTHOR_ID). These must be either discrete, string or sparse binary. + * Grouping by a sparse binary feature is different than grouping by a discrete or string + * feature. 
For example, if you have a sparse binary feature WORDS_IN_TWEET which is + * a set of all words in a tweet, then grouping by this feature generates a + * separate aggregate mean/count/etc for each value of the feature (each word), and + * not just a single aggregate count for different "sets of words" + * @param features Features to aggregate (e.g. blender_score or is_photo). + * @param labels Labels to cross the features with to make pair features, if any. + * @param metrics Aggregation metrics to compute (e.g. count, mean) + * @param halfLives Half lives to use for the aggregations, to be crossed with the above. + * use Duration.Top for "forever" aggregations over an infinite time window (no decay). + * @param outputStore Store to output this aggregate to + * @param includeAnyFeature Aggregate label counts for any feature value + * @param includeAnyLabel Aggregate feature counts for any label value (e.g. all impressions) + * @param includeTimestampFeature compute max aggregate on timestamp feature + * @param aggExclusionRegex Sequence of regexes that define features to exclude + * (TODO: confirm the exact matching semantics in TypedAggregateGroup) + */ +case class AggregateGroup( + inputSource: AggregateSource, + aggregatePrefix: String, + keys: Set[Feature[_]], + features: Set[Feature[_]], + labels: Set[_ <: Feature[JBoolean]], + metrics: Set[EasyMetric], + halfLives: Set[Duration], + outputStore: AggregateStore, + preTransforms: Seq[OneToSomeTransform] = Seq.empty, + includeAnyFeature: Boolean = true, + includeAnyLabel: Boolean = true, + includeTimestampFeature: Boolean = false, + aggExclusionRegex: Seq[String] = Seq.empty) { + + private def toStrongType[T]( + metrics: Set[EasyMetric], + features: Set[Feature[_]], + featureType: FeatureType + ): TypedAggregateGroup[_] = { + val underlyingMetrics: Set[AggregationMetric[T, _]] = + metrics.flatMap(_.forFeatureType[T](featureType)) + val underlyingFeatures: Set[Feature[T]] = features + .map(_.asInstanceOf[Feature[T]]) + + TypedAggregateGroup[T]( + inputSource = inputSource, + aggregatePrefix 
= aggregatePrefix, + keysToAggregate = keys, + featuresToAggregate = underlyingFeatures, + labels = labels, + metrics = underlyingMetrics, + halfLives = halfLives, + outputStore = outputStore, + preTransforms = preTransforms, + includeAnyFeature, + includeAnyLabel, + aggExclusionRegex + ) + } + + private def timestampTypedAggregateGroup: TypedAggregateGroup[_] = { + val metrics: Set[AggregationMetric[JLong, _]] = + Set(MaxMetric.forFeatureType[JLong](TypedAggregateGroup.timestampFeature.getFeatureType).get) + + TypedAggregateGroup[JLong]( + inputSource = inputSource, + aggregatePrefix = aggregatePrefix, + keysToAggregate = keys, + featuresToAggregate = Set(TypedAggregateGroup.timestampFeature), + labels = Set.empty, + metrics = metrics, + halfLives = Set(Duration.Top), + outputStore = outputStore, + preTransforms = preTransforms, + includeAnyFeature = false, + includeAnyLabel = true, + aggExclusionRegex = Seq.empty + ) + } + + def buildTypedAggregateGroups(): List[TypedAggregateGroup[_]] = { + val typedAggregateGroupsList = { + if (features.isEmpty) { + List(toStrongType(metrics, features, FeatureType.BINARY)) + } else { + features + .groupBy(_.getFeatureType()) + .toList + .map { + case (featureType, features) => + toStrongType(metrics, features, featureType) + } + } + } + + val optionalTimestampTypedAggregateGroup = + if (includeTimestampFeature) List(timestampTypedAggregateGroup) else List() + + typedAggregateGroupsList ++ optionalTimestampTypedAggregateGroup + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/AggregateSource.scala b/timelines/data_processing/ml_util/aggregation_framework/AggregateSource.scala new file mode 100644 index 0000000000..7fb239c65c --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/AggregateSource.scala @@ -0,0 +1,9 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework + +import com.twitter.ml.api.Feature +import java.lang.{Long => JLong} + +trait 
AggregateSource extends Serializable { + def name: String + def timestampFeature: Feature[JLong] +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/AggregateStore.scala b/timelines/data_processing/ml_util/aggregation_framework/AggregateStore.scala new file mode 100644 index 0000000000..1c09b33f01 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/AggregateStore.scala @@ -0,0 +1,5 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework + +trait AggregateStore extends Serializable { + def name: String +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/AggregationConfig.scala b/timelines/data_processing/ml_util/aggregation_framework/AggregationConfig.scala new file mode 100644 index 0000000000..2b117ddbd1 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/AggregationConfig.scala @@ -0,0 +1,5 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework + +trait AggregationConfig { + def aggregatesToCompute: Set[TypedAggregateGroup[_]] +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/AggregationKey.scala b/timelines/data_processing/ml_util/aggregation_framework/AggregationKey.scala new file mode 100644 index 0000000000..c3aafef692 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/AggregationKey.scala @@ -0,0 +1,50 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework + +import com.twitter.bijection.Bufferable +import com.twitter.bijection.Injection +import scala.util.Try + +/** + * Case class that represents the "grouping" key for any aggregate feature. 
+ * Used by Summingbird to output aggregates to the key-value "store" using sumByKey() + * + * @param discreteFeaturesById All discrete feature ids (+ values) that are part of this key + * @param textFeaturesById All string feature ids (+ values) that are part of this key + * + * Example 1: the user aggregate features in aggregatesv1 all group by USER_ID, + * which is a discrete feature. When storing these features, the key would be: + * + * discreteFeaturesById = Map(hash(USER_ID) -> userId), textFeaturesById = Map() + * + * Ex 2: If aggregating grouped by USER_ID, AUTHOR_ID, tweet link url, the key would be: + * + * discreteFeaturesById = Map(hash(USER_ID) -> userId, hash(AUTHOR_ID) -> authorId), + * textFeaturesById = Map(hash(URL_FEATURE) -> url) + * + * I could have just used a DataRecord for the key, but I wanted to make it strongly typed + * and only support grouping by discrete and string features, so using a case class instead. + * + * Re: efficiency, storing the hash of the feature in addition to just the feature value + * is somewhat more inefficient than only storing the feature value in the key, but it + * adds flexibility to group multiple types of aggregates in the same output store. If we + * decide this isn't a good tradeoff to make later, we can reverse/refactor this decision. + */ +case class AggregationKey( + discreteFeaturesById: Map[Long, Long], + textFeaturesById: Map[Long, String]) + +/** + * A custom injection for the above case class, + * so that Summingbird knows how to store it in Manhattan. 
+ */ +object AggregationKeyInjection extends Injection[AggregationKey, Array[Byte]] { + /* Injection from tuple representation of AggregationKey to Array[Byte] */ + val featureMapsInjection: Injection[(Map[Long, Long], Map[Long, String]), Array[Byte]] = + Bufferable.injectionOf[(Map[Long, Long], Map[Long, String])] + + def apply(aggregationKey: AggregationKey): Array[Byte] = + featureMapsInjection(AggregationKey.unapply(aggregationKey).get) + + def invert(ab: Array[Byte]): Try[AggregationKey] = + featureMapsInjection.invert(ab).map(AggregationKey.tupled(_)) +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/BUILD b/timelines/data_processing/ml_util/aggregation_framework/BUILD new file mode 100644 index 0000000000..aff4881168 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/BUILD @@ -0,0 +1,101 @@ +scala_library( + name = "common_types", + sources = ["*.scala"], + platform = "java8", + strict_deps = True, + tags = ["bazel-compatible"], + dependencies = [ + "3rdparty/jvm/com/google/guava", + "3rdparty/jvm/com/twitter/algebird:bijection", + "3rdparty/jvm/com/twitter/algebird:core", + "3rdparty/jvm/com/twitter/algebird:util", + "3rdparty/jvm/com/twitter/bijection:core", + "3rdparty/jvm/com/twitter/bijection:json", + "3rdparty/jvm/com/twitter/bijection:macros", + "3rdparty/jvm/com/twitter/bijection:netty", + "3rdparty/jvm/com/twitter/bijection:scrooge", + "3rdparty/jvm/com/twitter/bijection:thrift", + "3rdparty/jvm/com/twitter/bijection:util", + "3rdparty/jvm/org/apache/thrift:libthrift", + "3rdparty/src/jvm/com/twitter/scalding:date", + "3rdparty/src/jvm/com/twitter/summingbird:batch", + "src/java/com/twitter/ml/api:api-base", + "src/java/com/twitter/ml/api/constant", + "src/scala/com/twitter/dal/client/dataset", + "src/scala/com/twitter/ml/api/util:datarecord", + "src/scala/com/twitter/scalding_internal/dalv2/vkvs", + "src/scala/com/twitter/scalding_internal/multiformat/format/keyval", + 
"src/scala/com/twitter/storehaus_internal/manhattan/config", + "src/scala/com/twitter/storehaus_internal/offline", + "src/scala/com/twitter/storehaus_internal/util", + "src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits", + "src/scala/com/twitter/summingbird_internal/runner/store_config", + "src/thrift/com/twitter/dal/personal_data:personal_data-java", + "src/thrift/com/twitter/dal/personal_data:personal_data-scala", + "src/thrift/com/twitter/ml/api:data-java", + "timelines/data_processing/ml_util/aggregation_framework/metrics", + "timelines/data_processing/ml_util/transforms", + "util/util-core:util-core-util", + ], +) + +target( + name = "common_online_stores", + dependencies = [ + "src/scala/com/twitter/storehaus_internal/memcache", + ], +) + +target( + name = "common_offline_stores", + dependencies = [ + "src/scala/com/twitter/storehaus_internal/manhattan", + ], +) + +target( + name = "user_job", + dependencies = [ + "timelines/data_processing/ml_util/aggregation_framework/job", + ], +) + +target( + name = "scalding", + dependencies = [ + "timelines/data_processing/ml_util/aggregation_framework/scalding", + ], +) + +target( + name = "conversion", + dependencies = [ + "timelines/data_processing/ml_util/aggregation_framework/conversion", + ], +) + +target( + name = "query", + dependencies = [ + "timelines/data_processing/ml_util/aggregation_framework/query", + ], +) + +target( + name = "heron", + dependencies = [ + "timelines/data_processing/ml_util/aggregation_framework/heron", + ], +) + +target( + dependencies = [ + ":common_offline_stores", + ":common_online_stores", + ":common_types", + ":conversion", + ":heron", + ":query", + ":scalding", + ], +) diff --git a/timelines/data_processing/ml_util/aggregation_framework/DataRecordAggregationMonoid.scala b/timelines/data_processing/ml_util/aggregation_framework/DataRecordAggregationMonoid.scala new file mode 100644 index 0000000000..bc37c8e05b --- /dev/null +++ 
b/timelines/data_processing/ml_util/aggregation_framework/DataRecordAggregationMonoid.scala @@ -0,0 +1,92 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework + +import com.twitter.algebird.Monoid +import com.twitter.ml.api._ +import com.twitter.ml.api.constant.SharedFeatures +import com.twitter.ml.api.util.SRichDataRecord +import scala.collection.mutable +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon._ + +/** + * Monoid to aggregate over DataRecord objects. + * + * @param aggregates Set of ''TypedAggregateGroup'' case classes* + * to compute using this monoid (see TypedAggregateGroup.scala) + */ +trait DataRecordMonoid extends Monoid[DataRecord] { + + val aggregates: Set[TypedAggregateGroup[_]] + + def zero(): DataRecord = new DataRecord + + /* + * Add two datarecords using this monoid. + * + * @param left Left datarecord to add + * @param right Right datarecord to add + * @return Sum of the two datarecords as a DataRecord + */ + def plus(left: DataRecord, right: DataRecord): DataRecord = { + val result = zero() + aggregates.foreach(_.mutatePlus(result, left, right)) + val leftTimestamp = getTimestamp(left) + val rightTimestamp = getTimestamp(right) + SRichDataRecord(result).setFeatureValue( + SharedFeatures.TIMESTAMP, + leftTimestamp.max(rightTimestamp) + ) + result + } +} + +case class DataRecordAggregationMonoid(aggregates: Set[TypedAggregateGroup[_]]) + extends DataRecordMonoid { + + private def sumBuffer(buffer: mutable.ArrayBuffer[DataRecord]): Unit = { + val bufferSum = zero() + buffer.toIterator.foreach { value => + val leftTimestamp = getTimestamp(bufferSum) + val rightTimestamp = getTimestamp(value) + aggregates.foreach(_.mutatePlus(bufferSum, bufferSum, value)) + SRichDataRecord(bufferSum).setFeatureValue( + SharedFeatures.TIMESTAMP, + leftTimestamp.max(rightTimestamp) + ) + } + + buffer.clear() + buffer += bufferSum + } + + /* + * Efficient batched aggregation of 
datarecords using + * this monoid + a buffer, for performance. + * + * @param dataRecordIter An iterator of datarecords to sum + * @return A datarecord option containing the sum + */ + override def sumOption(dataRecordIter: TraversableOnce[DataRecord]): Option[DataRecord] = { + if (dataRecordIter.isEmpty) { + None + } else { + val buffer = mutable.ArrayBuffer[DataRecord]() + val BatchSize = 1000 + + dataRecordIter.foreach { u => + if (buffer.size > BatchSize) sumBuffer(buffer) + buffer += u + } + + if (buffer.size > 1) sumBuffer(buffer) + Some(buffer(0)) + } + } +} + +/* + * This class is used when there is no need to use sumBuffer functionality, as in the case of + * online aggregation of datarecords where using a buffer on a small number of datarecords + * would add some performance overhead. + */ +case class DataRecordAggregationMonoidNoBuffer(aggregates: Set[TypedAggregateGroup[_]]) + extends DataRecordMonoid {} diff --git a/timelines/data_processing/ml_util/aggregation_framework/KeyedRecord.scala b/timelines/data_processing/ml_util/aggregation_framework/KeyedRecord.scala new file mode 100644 index 0000000000..bb3096767f --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/KeyedRecord.scala @@ -0,0 +1,27 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework + +import com.twitter.ml.api.DataRecord + +/** + * Keyed record that is used to represent the aggregation type and its corresponding data record. + * + * @constructor creates a new keyed record. + * + * @param aggregateType the aggregate type + * @param record the data record associated with the key + **/ +case class KeyedRecord(aggregateType: AggregateType.Value, record: DataRecord) + +/** + * Keyed record map with multiple data records. + * + * @constructor creates a new keyed record map. 
+ * + * @param aggregateType the aggregate type + * @param recordMap a map with key of type Long and value of type DataRecord + * where the key indicates the index and the value indicating the record + * + **/ +case class KeyedRecordMap( + aggregateType: AggregateType.Value, + recordMap: scala.collection.Map[Long, DataRecord]) diff --git a/timelines/data_processing/ml_util/aggregation_framework/OfflineAggregateInjections.scala b/timelines/data_processing/ml_util/aggregation_framework/OfflineAggregateInjections.scala new file mode 100644 index 0000000000..7ab1233c19 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/OfflineAggregateInjections.scala @@ -0,0 +1,46 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework + +import com.twitter.dal.personal_data.thriftscala.PersonalDataType +import com.twitter.ml.api.DataRecord +import com.twitter.ml.api.Feature +import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection +import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Batched +import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.JavaCompactThrift +import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.genericInjection +import com.twitter.summingbird.batch.BatchID +import scala.collection.JavaConverters._ + +object OfflineAggregateInjections { + val offlineDataRecordAggregateInjection: KeyValInjection[AggregationKey, (BatchID, DataRecord)] = + KeyValInjection( + genericInjection(AggregationKeyInjection), + Batched(JavaCompactThrift[DataRecord]) + ) + + private[aggregation_framework] def getPdts[T]( + aggregateGroups: Iterable[T], + featureExtractor: T => Iterable[Feature[_]] + ): Option[Set[PersonalDataType]] = { + val pdts: Set[PersonalDataType] = for { + group <- aggregateGroups.toSet[T] + feature <- featureExtractor(group) + pdtSet <- feature.getPersonalDataTypes.asSet().asScala + javaPdt <- pdtSet.asScala + scalaPdt <- 
PersonalDataType.get(javaPdt.getValue) + } yield { + scalaPdt + } + if (pdts.nonEmpty) Some(pdts) else None + } + + def getInjection( + aggregateGroups: Set[TypedAggregateGroup[_]] + ): KeyValInjection[AggregationKey, (BatchID, DataRecord)] = { + val keyPdts = getPdts[TypedAggregateGroup[_]](aggregateGroups, _.allOutputKeys) + val valuePdts = getPdts[TypedAggregateGroup[_]](aggregateGroups, _.allOutputFeatures) + KeyValInjection( + genericInjection(AggregationKeyInjection, keyPdts), + genericInjection(Batched(JavaCompactThrift[DataRecord]), valuePdts) + ) + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/OfflineAggregateSource.scala b/timelines/data_processing/ml_util/aggregation_framework/OfflineAggregateSource.scala new file mode 100644 index 0000000000..116f553c46 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/OfflineAggregateSource.scala @@ -0,0 +1,21 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework + +import com.twitter.dal.client.dataset.TimePartitionedDALDataset +import com.twitter.ml.api.DataRecord +import com.twitter.ml.api.Feature +import java.lang.{Long => JLong} + +case class OfflineAggregateSource( + override val name: String, + override val timestampFeature: Feature[JLong], + scaldingHdfsPath: Option[String] = None, + scaldingSuffixType: Option[String] = None, + dalDataSet: Option[TimePartitionedDALDataset[DataRecord]] = None, + withValidation: Boolean = true) // context: https://jira.twitter.biz/browse/TQ-10618 + extends AggregateSource { + /* + * To help transition callers to use DAL.read, we check that either the HDFS + * path is defined, or the dalDataSet. Both options cannot be set at the same time. 
+ */ + assert(!(scaldingHdfsPath.isDefined && dalDataSet.isDefined)) +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/OfflineAggregateStore.scala b/timelines/data_processing/ml_util/aggregation_framework/OfflineAggregateStore.scala new file mode 100644 index 0000000000..0bba08a948 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/OfflineAggregateStore.scala @@ -0,0 +1,128 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework + +import com.twitter.dal.client.dataset.KeyValDALDataset +import com.twitter.ml.api.DataRecord +import com.twitter.scalding.DateParser +import com.twitter.scalding.RichDate +import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal +import com.twitter.storehaus_internal.manhattan._ +import com.twitter.storehaus_internal.util.ApplicationID +import com.twitter.storehaus_internal.util.DatasetName +import com.twitter.storehaus_internal.util.HDFSPath +import com.twitter.summingbird.batch.BatchID +import com.twitter.summingbird.batch.Batcher +import com.twitter.summingbird_internal.runner.store_config._ +import java.util.TimeZone +import com.twitter.summingbird.batch.MillisecondBatcher + +/* + * Configuration common to all offline aggregate stores + * + * @param outputHdfsPathPrefix HDFS prefix to store all output aggregate types offline + * @param dummyAppId Dummy manhattan app id required by summingbird (unused) + * @param dummyDatasetPrefix Dummy manhattan dataset prefix required by summingbird (unused) + * @param startDate Start date for summingbird job to begin computing aggregates + */ +case class OfflineAggregateStoreCommonConfig( + outputHdfsPathPrefix: String, + dummyAppId: String, + dummyDatasetPrefix: String, + startDate: String) + +/** + * A trait inherited by any object that defines + * a HDFS prefix to write output data to. E.g. timelines has its own + * output prefix to write aggregates_v2 results, your team can create + * its own. 
/**
 * Base class for offline aggregate stores.
 *
 * Concrete stores supply the abstract members below:
 *  - `name`: uniquely identifiable human-readable name for this output store
 *  - `startDate`: start date for this output store from which aggregates should be computed
 *  - `commonConfig`: provider of other common configuration details
 *  - `batchesToKeep`: retention policy on output (number of batches to keep)
 *  - `maxKvSourceFailures`: not interpreted in this class
 *
 * The vals below are computed while this base class is being constructed and read
 * `name`, `startDate` and `commonConfig`, so those members must already be usable at
 * that point.
 */
abstract class OfflineAggregateStoreBase
    extends OfflineStoreOnlyConfig[ManhattanROConfig]
    with AggregateStore {

  override def name: String
  def startDate: String
  def commonConfig: OfflineStoreCommonConfig
  def batchesToKeep: Int
  def maxKvSourceFailures: Int

  // Common configuration resolved for this store's start date.
  val datedCommonConfig: OfflineAggregateStoreCommonConfig = commonConfig(startDate)

  /* This is a sample config, will be replaced with production config later */
  val manhattan: ManhattanROConfig = {
    val outputPath = HDFSPath(s"${datedCommonConfig.outputHdfsPathPrefix}/${name}")
    val applicationId = ApplicationID(datedCommonConfig.dummyAppId)
    val datasetName = DatasetName(s"${datedCommonConfig.dummyDatasetPrefix}_${name}_1")
    ManhattanROConfig(
      outputPath,
      applicationId,
      datasetName,
      com.twitter.storehaus_internal.manhattan.Adama
    )
  }

  // Batches are `batcherSize` hours long.
  val batcherSize = 24
  val batcher: MillisecondBatcher = Batcher.ofHours(batcherSize)

  // The configured start date, parsed in UTC.
  val startTime: RichDate = {
    val utc = TimeZone.getTimeZone("UTC")
    RichDate(datedCommonConfig.startDate)(utc, DateParser.default)
  }

  val offline: ManhattanROConfig = manhattan
}
/**
 * An aggregates store which is composed of DataRecords.
 *
 * @param name                Uniquely identifiable human-readable name for this output store
 * @param startDate           Start date for this output store from which aggregates should be computed
 * @param commonConfig        Provider of other common configuration details
 * @param batchesToKeep       Retention policy on output (number of batches to keep)
 * @param maxKvSourceFailures Carried over unchanged when converting to the DAL variant
 */
case class OfflineAggregateDataRecordStore(
  override val name: String,
  override val startDate: String,
  override val commonConfig: OfflineStoreCommonConfig,
  override val batchesToKeep: Int = 7,
  override val maxKvSourceFailures: Int = 0)
    extends OfflineAggregateStoreBase {

  /**
   * Builds the DAL-writing counterpart of this store.
   *
   * `batchesToKeep` is not forwarded, so the resulting store uses its own default for it.
   *
   * @param dalDataset the KeyValDALDataset the resulting store is bound to
   * @return a store with the same name, start date, common config and maxKvSourceFailures
   */
  def toOfflineAggregateDataRecordStoreWithDAL(
    dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]]
  ): OfflineAggregateDataRecordStoreWithDAL = {
    val storeWithDal = OfflineAggregateDataRecordStoreWithDAL(
      name,
      startDate,
      commonConfig,
      dalDataset,
      maxKvSourceFailures = maxKvSourceFailures
    )
    storeWithDal
  }
}
+ */ +case class OfflineAggregateDataRecordStoreWithDAL( + override val name: String, + override val startDate: String, + override val commonConfig: OfflineStoreCommonConfig, + override val dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]], + override val batchesToKeep: Int = -1, + override val maxKvSourceFailures: Int = 0) + extends OfflineAggregateStoreBase + with withDALDataset diff --git a/timelines/data_processing/ml_util/aggregation_framework/README.md b/timelines/data_processing/ml_util/aggregation_framework/README.md new file mode 100644 index 0000000000..ea9a4b4469 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/README.md @@ -0,0 +1,39 @@ +Overview +======== + + +The **aggregation framework** is a set of libraries and utilities that allows teams to flexibly +compute aggregate (counting) features in both batch and in real-time. Aggregate features can capture +historical interactions between arbitrary entities (and sets thereof), conditional on provided features +and labels. + +These types of engineered aggregate features have proven to be highly impactful across different teams at Twitter. + + +What are some features we can compute? +-------------------------------------- + +The framework supports computing aggregate features on provided grouping keys. The only constraint is that these keys are sparse binary features (or are sets thereof). + +For example, a common use case is to calculate a user's past engagement history with various types of tweets (photo, video, retweets, etc.), specific authors, specific in-network engagers or any other entity the user has interacted with and that could provide signal. In this case, the underlying aggregation keys are `userId`, `(userId, authorId)` or `(userId, engagerId)`. + +In Timelines and MagicRecs, we also compute custom aggregate engagement counts on every `tweetId`.
Similarly, other aggregations are possible, perhaps on `advertiserId` or `mediaId` as long as the grouping key is sparse binary. + + +What implementations are supported? +----------------------------------- + +Offline, we support the daily batch processing of DataRecords containing all required input features to generate +aggregate features. These are then uploaded to Manhattan for online hydration. + +Online, we support the real-time aggregation of DataRecords through Storm with a backing memcache that can be queried +for the real-time aggregate features. + +Additional documentation exists in the [docs folder](docs) + + +Where is this used? +-------------------- + +The Home Timeline heavy ranker uses a variety of both [batch and real time features](../../../../src/scala/com/twitter/timelines/prediction/common/aggregates/README.md) generated by this framework. +These features are also used for email and other recommendations. \ No newline at end of file diff --git a/timelines/data_processing/ml_util/aggregation_framework/StoreConfig.scala b/timelines/data_processing/ml_util/aggregation_framework/StoreConfig.scala new file mode 100644 index 0000000000..703d5893c5 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/StoreConfig.scala @@ -0,0 +1,68 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework + +import com.twitter.ml.api.constant.SharedFeatures +import com.twitter.ml.api.Feature +import com.twitter.ml.api.FeatureType + +/** + * Convenience class to describe the stores that make up a particular type of aggregate. + * + * For example, as of 2018/07, user aggregates are generated by merging the individual + * "user_aggregates", "rectweet_user_aggregates", and, "twitter_wide_user_aggregates". + * + * @param storeNames Name of the stores. + * @param aggregateType Type of aggregate, usually differentiated by the aggregation key.
/**
 * Convenience class to describe the stores that make up a particular type of aggregate.
 *
 * For example, as of 2018/07, user aggregates are generated by merging the individual
 * "user_aggregates", "rectweet_user_aggregates", and, "twitter_wide_user_aggregates".
 *
 * @param storeNames    Names of the stores. Must be non-empty and valid to merge according to
 *                      the implicit [[StoreMerger]].
 * @param aggregateType Type of aggregate, usually differentiated by the aggregation key.
 * @param shouldHash    Used at TimelineRankingAggregatesUtil.extractSecondary when extracting the
 *                      secondary key value.
 * @param storeMerger   Validates the stores and resolves their aggregation keys.
 * @throws IllegalArgumentException if `storeNames` is empty or the stores cannot be merged
 */
case class StoreConfig[T](
  storeNames: Set[String],
  aggregateType: AggregateType.Value,
  shouldHash: Boolean = false
)(
  implicit storeMerger: StoreMerger) {
  // Fail fast with a clear message: `storeNames.head` below would otherwise throw an opaque
  // error on an empty set, which the merge check alone does not necessarily reject.
  require(storeNames.nonEmpty, "storeNames must contain at least one store name.")
  require(
    storeMerger.isValidToMerge(storeNames),
    s"Stores [${storeNames.mkString(", ")}] cannot be merged."
  )

  // The stores were accepted as mergeable above, so one of them stands in for all of them
  // when resolving keys.
  private val representativeStore = storeNames.head

  val aggregationKeyIds: Set[Long] = storeMerger.getAggregateKeys(representativeStore)
  val aggregationKeyFeatures: Set[Feature[_]] =
    storeMerger.getAggregateKeyFeatures(representativeStore)
  val secondaryKeyFeatureOpt: Option[Feature[_]] = storeMerger.getSecondaryKey(representativeStore)
}
/**
 * Stores may only be merged if they have the same aggregation key.
 *
 * @param storeNames names of the stores that are candidates for merging
 * @return true when every named store resolves to the same set of aggregation key ids
 *         as the first one; vacuously true for an empty set
 */
def isValidToMerge(storeNames: Set[String]): Boolean =
  // Option.forall is true for None, matching the previous behaviour on an empty set
  // without having to call .get on the Option.
  storeNames.headOption.forall { firstStoreName =>
    val expectedKeys = getAggregateKeys(firstStoreName)
    storeNames.forall(storeName => getAggregateKeys(storeName) == expectedKeys)
  }
com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform +import com.twitter.util.Duration +import com.twitter.util.Try +import java.lang.{Boolean => JBoolean} +import java.lang.{Double => JDouble} +import java.lang.{Long => JLong} +import java.util.{Set => JSet} +import scala.annotation.tailrec +import scala.language.existentials +import scala.collection.JavaConverters._ +import scala.util.matching.Regex + +/** + * A case class contained precomputed data useful to quickly + * process operations over an aggregate. + * + * @param query The underlying feature being aggregated + * @param metric The aggregation metric + * @param outputFeatures The output features that aggregation will produce + * @param outputFeatureIds The precomputed hashes of the above outputFeatures + */ +case class PrecomputedAggregateDescriptor[T]( + query: AggregateFeature[T], + metric: AggregationMetric[T, _], + outputFeatures: List[Feature[_]], + outputFeatureIds: List[JLong]) + +object TypedAggregateGroup { + + /** + * Recursive function that generates all combinations of value + * assignments for a collection of sparse binary features. 
/**
 * Generates all combinations of value assignments for a collection of
 * sparse binary features.
 *
 * The first id seeds one single-entry map per value; every following id then extends each
 * map built so far with each of its own values. The result is therefore empty when the
 * input list is empty or when any id has no values.
 *
 * @param sparseBinaryIdValues list of sparse binary feature ids and possible values they can take
 * @return A set of maps, where each map represents one possible assignment of values to ids
 */
def sparseBinaryPermutations(
  sparseBinaryIdValues: List[(Long, Set[String])]
): Set[Map[Long, String]] =
  sparseBinaryIdValues match {
    case Nil => Set.empty
    case (firstId, firstValues) :: remainingIdValues =>
      val seed: Set[Map[Long, String]] = firstValues.map(value => Map(firstId -> value))
      remainingIdValues.foldLeft(seed) {
        case (assignmentsSoFar, (id, values)) =>
          for {
            assignment <- assignmentsSoFar
            value <- values
          } yield assignment + (id -> value)
      }
  }
/**
 * Drops the entries whose value is absent and unwraps the remaining ones.
 *
 * @param featureIdValueMap map from feature id to an optional value
 * @return map containing only the ids that had a value, mapped to that value
 */
def liftOptions[U](featureIdValueMap: Map[Long, Option[U]]): Map[Long, U] =
  featureIdValueMap.collect {
    case (featureId, Some(value)) => featureId -> value
  }
/**
 * Builds all valid aggregation keys (for the output store) from
 * a datarecord and a spec listing the keys to aggregate.
 *
 * Discrete and string key features must all be present in the record, otherwise no key is
 * produced. Sparse binary key features fan out: one aggregation key is produced per
 * combination of their member values, so a single record can yield several keys.
 *
 * @param dataRecord      Data record to read values for key features from
 * @param keysToAggregate Key features to group by
 * @return A set of AggregationKeys encoding the values of all keys
 */
def buildAggregationKeys(
  dataRecord: DataRecord,
  keysToAggregate: Set[Feature[_]]
): Set[AggregationKey] = {
  val discreteValuesById =
    getKeyFeatureIdValues[Long](dataRecord, keysToAggregate, FeatureType.DISCRETE).toMap

  val textValuesById =
    getKeyFeatureIdValues[String](dataRecord, keysToAggregate, FeatureType.STRING).toMap

  // A sparse binary feature that is missing from the record is treated as having no members.
  val sparseBinaryIdValues =
    getKeyFeatureIdValues[JSet[String]](dataRecord, keysToAggregate, FeatureType.SPARSE_BINARY)
      .map {
        case (id, membersOpt) =>
          val members = membersOpt.map(_.asScala.toSet).getOrElse(Set.empty[String])
          (id, members)
      }
      .toList

  val allRequiredKeysPresent =
    checkIfAllKeysExist(discreteValuesById) && checkIfAllKeysExist(textValuesById)

  if (!allRequiredKeysPresent) {
    Set.empty[AggregationKey]
  } else {
    val discreteKeys = liftOptions(discreteValuesById)
    val textKeys = liftOptions(textValuesById)

    sparseBinaryIdValues match {
      case Nil =>
        Set(
          AggregationKey(
            discreteFeaturesById = discreteKeys,
            textFeaturesById = textKeys
          )
        )
      case idValues =>
        sparseBinaryPermutations(idValues).map { sparseBinaryTextKeys =>
          AggregationKey(
            discreteFeaturesById = discreteKeys,
            textFeaturesById = textKeys ++ sparseBinaryTextKeys
          )
        }
    }
  }
}
for sampling) before aggregation + * @param aggregatePrefix Prefix to use for naming resultant aggregate features + * @param keysToAggregate Features to group by when computing the aggregates + * (e.g. USER_ID, AUTHOR_ID) + * @param featuresToAggregate Features to aggregate (e.g. blender_score or is_photo) + * @param labels Labels to cross the features with to make pair features, if any. + * use Label.All if you don't want to cross with a label. + * @param metrics Aggregation metrics to compute (e.g. count, mean) + * @param halfLives Half lives to use for the aggregations, to be crossed with the above. + * use Duration.Top for "forever" aggregations over an infinite time window (no decay). + * @param outputStore Store to output this aggregate to + * @param includeAnyFeature Aggregate label counts for any feature value + * @param includeAnyLabel Aggregate feature counts for any label value (e.g. all impressions) + * + * The overall config for the summingbird job consists of a list of "AggregateGroup" + * case class objects, which get translated into strongly typed "TypedAggregateGroup" + * case class objects. A single TypedAggregateGroup always groups input data records from + * ''inputSource'' by a single set of aggregation keys (''keysToAggregate''). + * Within these groups, we perform a comprehensive cross of: + * + * ''featuresToAggregate'' x ''labels'' x ''metrics'' x ''halfLives'' + * + * All the resultant aggregate features are assigned a human-readable feature name + * beginning with ''aggregatePrefix'', and are written to DataRecords that get + * aggregated and written to the store specified by ''outputStore''. + * + * Illustrative example.
Suppose we define our spec as follows: + * + * TypedAggregateGroup( + * inputSource = "timelines_recap_daily", + * aggregatePrefix = "user_author_aggregate", + * keysToAggregate = Set(USER_ID, AUTHOR_ID), + * featuresToAggregate = Set(RecapFeatures.TEXT_SCORE, RecapFeatures.BLENDER_SCORE), + * labels = Set(RecapFeatures.IS_FAVORITED, RecapFeatures.IS_REPLIED), + * metrics = Set(CountMetric, MeanMetric), + * halfLives = Set(7.Days, 30.Days), + * outputStore = "user_author_aggregate_store" + * ) + * + * This will process data records from the source named "timelines_recap_daily" + * (see AggregateSource.scala for more details on how to add your own source) + * It will produce a total of 2x2x2x2 = 16 aggregation features, named like: + * + * user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.count.7days + * user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.count.30days + * user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.mean.7days + * + * ... (and so on) + * + * and all the result features will be stored in DataRecords, summed up, and written + * to the output store defined by the name "user_author_aggregate_store". + * (see AggregateStore.scala for details on how to add your own store). + * + * If you do not want a full cross, split up your config into multiple TypedAggregateGroup + * objects. Splitting is strongly advised to avoid blowing up and creating invalid + * or unnecessary combinations of aggregate features (note that some combinations + * are useless or invalid e.g. computing the mean of a binary feature). Splitting + * also does not cost anything in terms of real-time performance, because all + * Aggregate objects in the master spec that share the same ''keysToAggregate'', the + * same ''inputSource'' and the same ''outputStore'' are grouped by the summingbird + * job logic and stored into a single DataRecord in the output store. 
/**
 * Decides whether an aggregate descriptor should be excluded.
 *
 * @param feature descriptor whose output features are checked
 * @param regexes exclusion patterns; an empty sequence never excludes anything
 * @return true if the descriptor should be dropped (the dense name of at least one of its
 *         output features contains a match for at least one pattern), false if it should
 *         be kept
 */
def filterOutAggregateFeature(
  feature: PrecomputedAggregateDescriptor[_],
  regexes: Seq[Regex]
): Boolean =
  regexes.nonEmpty && feature.outputFeatures.exists { outputFeature =>
    val denseName = outputFeature.getDenseFeatureName
    regexes.exists(regex => regex.findFirstMatchIn(denseName).isDefined)
  }
/*
 * Precomputed lookup from the feature id of every CONTINUOUS output feature generated by
 * this group to the half life of the aggregate query that produced it. Output features of
 * any other type are left out.
 */
val continuousFeatureIdsToHalfLives: Map[Long, Duration] =
  (for {
    descriptor <- individualAggregateDescriptors
    outputFeature <- descriptor.outputFeatures
    if outputFeature.getFeatureType() == FeatureType.CONTINUOUS
  } yield (outputFeature.getFeatureId(), descriptor.query.halfLife)).toMap
/**
 * Runs the preTransforms one after another on a copy of the input record.
 *
 * Each transform receives the output of the previous one, so the order of preTransforms
 * matters. As soon as one transform drops the record (returns None), the remaining
 * transforms are not applied and the overall result is None.
 *
 * @param dataRecord record to transform; it is copied first, the transforms see the copy
 * @return the fully transformed record, or None if any transform dropped it
 */
private[this] def sequentiallyTransform(dataRecord: DataRecord): Option[DataRecord] = {
  val copiedRecord: Option[DataRecord] = Option(new DataRecord(dataRecord))
  preTransforms.foldLeft(copiedRecord) { (currentOpt, preTransform) =>
    currentOpt.flatMap(current => preTransform(current))
  }
}
/**
 * Given a data record, apply transforms and collect the incremental contributions of the
 * transformed record to each configured aggregate into one output data record.
 *
 * The result is empty when a pre-transform drops the record or when no descriptor reports
 * a contribution. Otherwise the increment is stamped with the record's timestamp and paired
 * with every aggregation key built from the record; all pairs share the same increment
 * instance.
 *
 * @param dataRecord Input data record to aggregate.
 * @return A set of tuples (AggregationKey, DataRecord) whose first entry is an
 *         AggregationKey indicating what keys we're grouping by, and whose second entry
 *         is an output data record with incremental contributions to the aggregate value(s)
 */
def computeAggregateKVPairs(dataRecord: DataRecord): Set[(AggregationKey, DataRecord)] =
  sequentiallyTransform(dataRecord) match {
    case None =>
      Set.empty[(AggregationKey, DataRecord)]

    case Some(transformedRecord) =>
      val aggregationKeys = buildAggregationKeys(transformedRecord)
      val increment = new DataRecord

      // setIncrement is evaluated first on every iteration, so each descriptor gets to
      // write into the increment even after an earlier one already reported a contribution.
      val isNonEmptyIncrement =
        individualAggregateDescriptors.foldLeft(false) { (anyContributionSoFar, descriptor) =>
          val contributed = descriptor.metric.setIncrement(
            output = increment,
            input = transformedRecord,
            query = descriptor.query,
            timestampFeature = inputSource.timestampFeature,
            aggregateOutputs = Some(descriptor.outputFeatureIds)
          )
          contributed || anyContributionSoFar
        }

      if (isNonEmptyIncrement) {
        SRichDataRecord(increment).setFeatureValue(
          timestampFeature,
          getTimestamp(transformedRecord, inputSource.timestampFeature)
        )
        aggregationKeys.map(key => (key, increment))
      } else {
        Set.empty[(AggregationKey, DataRecord)]
      }
  }
FeatureType.SPARSE_BINARY => + new Feature.SparseBinary(renamedFeatureName, personalDataTypes) + case FeatureType.SPARSE_CONTINUOUS => + new Feature.SparseContinuous(renamedFeatureName, personalDataTypes) + } + feature -> renamedFeature + } else { + feature -> feature + } + }.toMap + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/Utils.scala b/timelines/data_processing/ml_util/aggregation_framework/Utils.scala new file mode 100644 index 0000000000..60196fc625 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/Utils.scala @@ -0,0 +1,122 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework + +import com.twitter.algebird.ScMapMonoid +import com.twitter.algebird.Semigroup +import com.twitter.ml.api._ +import com.twitter.ml.api.constant.SharedFeatures +import com.twitter.ml.api.DataRecord +import com.twitter.ml.api.Feature +import com.twitter.ml.api.FeatureType +import com.twitter.ml.api.util.SRichDataRecord +import java.lang.{Long => JLong} +import scala.collection.{Map => ScMap} + +object Utils { + val dataRecordMerger: DataRecordMerger = new DataRecordMerger + def EmptyDataRecord: DataRecord = new DataRecord() + + private val random = scala.util.Random + private val keyedDataRecordMapMonoid = { + val dataRecordMergerSg = new Semigroup[DataRecord] { + override def plus(x: DataRecord, y: DataRecord): DataRecord = { + dataRecordMerger.merge(x, y) + x + } + } + new ScMapMonoid[Long, DataRecord]()(dataRecordMergerSg) + } + + def keyFromLong(record: DataRecord, feature: Feature[JLong]): Long = + SRichDataRecord(record).getFeatureValue(feature).longValue + + def keyFromString(record: DataRecord, feature: Feature[String]): Long = + try { + SRichDataRecord(record).getFeatureValue(feature).toLong + } catch { + case _: NumberFormatException => 0L + } + + def keyFromHash(record: DataRecord, feature: Feature[String]): Long = + SRichDataRecord(record).getFeatureValue(feature).hashCode.toLong + + def 
/**
 * Merges all given records into the first one.
 *
 * With no arguments a fresh record obtained from EmptyDataRecord is returned. Otherwise the
 * first argument serves as the accumulator: every following record is merged into it in
 * place, and that same (mutated) first record is returned.
 *
 * @param args records to merge, in order
 * @return the first record after merging the others into it, or an empty record
 */
private def mergeRecords(args: DataRecord*): DataRecord =
  args.headOption match {
    case None =>
      EmptyDataRecord
    case Some(accumulator) =>
      args.tail.foreach(record => dataRecordMerger.merge(accumulator, record))
      accumulator
  }
keySet = opt1.map(_.keySet).getOrElse(Set.empty) ++ opt2.map(_.keySet).getOrElse(Set.empty) + val totalSize = keySet.size + val rate = if (totalSize <= maxSize) 1.0 else maxSize.toDouble / totalSize + val prunedOpt1 = opt1.map(downsample(_, rate)) + val prunedOpt2 = opt2.map(downsample(_, rate)) + Seq(prunedOpt1, prunedOpt2).flatten + .foldLeft(keyedDataRecordMapMonoid.zero)(keyedDataRecordMapMonoid.plus) + } + + def downsample[K, T](m: ScMap[K, T], samplingRate: Double): ScMap[K, T] = { + if (samplingRate >= 1.0) { + m + } else if (samplingRate <= 0) { + Map.empty + } else { + m.filter { + case (key, _) => + // It is important that the same user with the same sampling rate be deterministically + // selected or rejected. Otherwise, mergeMapOpts will choose different keys for the + // two input maps and their union will be larger than the limit we want. + random.setSeed((key.hashCode, samplingRate.hashCode).hashCode) + random.nextDouble < samplingRate + } + } + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/conversion/AggregatesV2Adapter.scala b/timelines/data_processing/ml_util/aggregation_framework/conversion/AggregatesV2Adapter.scala new file mode 100644 index 0000000000..f5b7d1814e --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/conversion/AggregatesV2Adapter.scala @@ -0,0 +1,165 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion + +import com.twitter.algebird.DecayedValue +import com.twitter.algebird.DecayedValueMonoid +import com.twitter.algebird.Monoid +import com.twitter.ml.api._ +import com.twitter.ml.api.constant.SharedFeatures +import com.twitter.ml.api.util.FDsl._ +import com.twitter.ml.api.util.SRichDataRecord +import com.twitter.summingbird.batch.BatchID +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup +import 
object AggregatesV2Adapter {
  type AggregatesV2Tuple = (AggregationKey, (BatchID, DataRecord))

  val Epsilon: Double = 1e-6
  val decayedValueMonoid: Monoid[DecayedValue] = DecayedValueMonoid(Epsilon)

  /**
   * Decays `storedValue` from `timestamp` forward to `sourceVersion`.
   *
   * By applying this function, the feature values for all users are decayed
   * to sourceVersion. This is important to ensure that a user whose aggregates
   * were updated long in the past does not have an artificially inflated count
   * compared to one whose aggregates were updated (and hence decayed) more recently.
   *
   * @param storedValue   value read from the aggregates v2 output store
   * @param timestamp     timestamp corresponding to the stored value
   * @param sourceVersion timestamp of version to decay all values to uniformly
   * @param halfLife      half life duration to use for applying decay
   * @return `storedValue` unchanged when `timestamp` is later than `sourceVersion`;
   *         otherwise the value obtained by combining it, through `decayedValueMonoid`,
   *         with a zero-valued entry at `sourceVersion`
   */
  def decayValueToSourceVersion(
    storedValue: Double,
    timestamp: Long,
    sourceVersion: Long,
    halfLife: Duration
  ): Double =
    if (timestamp > sourceVersion) {
      storedValue
    } else {
      decayedValueMonoid
        .plus(
          DecayedValue.build(storedValue, timestamp, halfLife.inMilliseconds),
          DecayedValue.build(0, sourceVersion, halfLife.inMilliseconds)
        )
        .value
    }

  /**
   * Decays all the continuous aggregate features occurring in `inputRecord`
   * to a given timestamp, and mutates `outputRecord` accordingly.
   * `inputRecord` and `outputRecord` may be the same instance to decay in place.
   *
   * @param inputRecord                   input record to get features (and TIMESTAMP) from
   * @param aggregateFeaturesAndHalfLives features to decay, each paired with its half life;
   *                                      entries whose feature type is not CONTINUOUS are skipped
   * @param decayTo                       timestamp to decay to
   * @param trimThreshold                 decayed values whose absolute value does not exceed
   *                                      this threshold are not written
   * @param outputRecord                  output record to mutate
   * @return the mutated outputRecord, with TIMESTAMP set to `decayTo`
   */
  def mutateDecay(
    inputRecord: DataRecord,
    aggregateFeaturesAndHalfLives: List[(Feature[_], Duration)],
    decayTo: Long,
    trimThreshold: Double,
    outputRecord: DataRecord
  ): DataRecord = {
    val timestamp = inputRecord.getFeatureValue(SharedFeatures.TIMESTAMP).toLong
    // When decaying in place, a trimmed feature must be removed explicitly: otherwise the
    // stale, undecayed value would survive while TIMESTAMP below is advanced to decayTo.
    val inPlace = outputRecord eq inputRecord

    aggregateFeaturesAndHalfLives.foreach {
      case (aggregateFeature: Feature[_], halfLife: Duration) =>
        if (aggregateFeature.getFeatureType() == FeatureType.CONTINUOUS) {
          val continuousFeature = aggregateFeature.asInstanceOf[Feature[JDouble]]
          if (inputRecord.hasFeature(continuousFeature)) {
            val storedValue = inputRecord.getFeatureValue(continuousFeature).toDouble
            val decayedValue = decayValueToSourceVersion(storedValue, timestamp, decayTo, halfLife)
            if (math.abs(decayedValue) > trimThreshold) {
              outputRecord.setFeatureValue(continuousFeature, decayedValue)
            } else if (inPlace) {
              SRichDataRecord(outputRecord).clearFeature(continuousFeature)
            }
          }
        }
    }

    /* Update timestamp to version (now that we've decayed all aggregates) */
    outputRecord.setFeatureValue(SharedFeatures.TIMESTAMP, decayTo)

    outputRecord
  }
}

/**
 * Adapts `(AggregationKey, (BatchID, DataRecord))` tuples read from an aggregates store
 * into data records: key features are copied from the aggregation key and continuous
 * aggregate features are decayed to `sourceVersion`.
 *
 * @param aggregates    aggregate groups whose output keys and features define the context
 * @param sourceVersion timestamp every aggregate is decayed to
 * @param trimThreshold passed to `mutateDecay`; decayed values not above it are dropped
 */
class AggregatesV2Adapter(
  aggregates: Set[TypedAggregateGroup[_]],
  sourceVersion: Long,
  trimThreshold: Double)
    extends IRecordOneToManyAdapter[AggregatesV2Adapter.AggregatesV2Tuple] {

  import AggregatesV2Adapter._

  val keyFeatures: List[Feature[_]] = aggregates.flatMap(_.allOutputKeys).toList
  val aggregateFeatures: List[Feature[_]] = aggregates.flatMap(_.allOutputFeatures).toList
  val timestampFeatures: List[Feature[JLong]] = List(SharedFeatures.TIMESTAMP)
  val allFeatures: List[Feature[_]] = keyFeatures ++ aggregateFeatures ++ timestampFeatures

  val featureContext: FeatureContext = new FeatureContext(allFeatures.asJava)

  override def getFeatureContext: FeatureContext = featureContext

  // Half lives are parsed once here rather than for every adapted record.
  val aggregateFeaturesAndHalfLives: List[(Feature[_$3], Duration) forSome { type _$3 }] =
    aggregateFeatures.map { aggregateFeature: Feature[_] =>
      val halfLife = AggregateFeature.parseHalfLife(aggregateFeature)
      (aggregateFeature, halfLife)
    }

  /**
   * @return a single-element list holding the decayed record, or an empty list when the
   *         stored record carries no TIMESTAMP feature
   */
  override def adaptToDataRecords(tuple: AggregatesV2Tuple): ju.List[DataRecord] = tuple match {
    case (key: AggregationKey, (batchId: BatchID, record: DataRecord)) => {
      val resultRecord = new SRichDataRecord(new DataRecord, featureContext)

      // NOTE(review): resultRecord wraps a DataRecord created on the line above, so this scan
      // presumably never finds a feature to clear — confirm and consider removing it.
      val itr = resultRecord.continuousFeaturesIterator()
      val featuresToClear = mutable.Set[Feature[JDouble]]()
      while (itr.moveNext()) {
        val nextFeature = itr.getFeature
        if (!aggregateFeatures.contains(nextFeature)) {
          featuresToClear += nextFeature
        }
      }

      featuresToClear.foreach(resultRecord.clearFeature)

      // Only DISCRETE and STRING key features are copied; other key feature types are ignored.
      keyFeatures.foreach { keyFeature: Feature[_] =>
        if (keyFeature.getFeatureType == FeatureType.DISCRETE) {
          resultRecord.setFeatureValue(
            keyFeature.asInstanceOf[Feature[JLong]],
            key.discreteFeaturesById(keyFeature.getDenseFeatureId)
          )
        } else if (keyFeature.getFeatureType == FeatureType.STRING) {
          resultRecord.setFeatureValue(
            keyFeature.asInstanceOf[Feature[String]],
            key.textFeaturesById(keyFeature.getDenseFeatureId)
          )
        }
      }

      if (record.hasFeature(SharedFeatures.TIMESTAMP)) {
        mutateDecay(
          record,
          aggregateFeaturesAndHalfLives,
          sourceVersion,
          trimThreshold,
          resultRecord)
        List(resultRecord.getRecord).asJava
      } else {
        List.empty[DataRecord].asJava
      }
    }
  }
}
/**
 * Feature source that reads one version of an aggregates v2 versioned key-value store
 * and exposes it as `(AggregationKey, (BatchID, DataRecord))` tuples.
 *
 * @tparam StorageFormat type the records are stored as; converted by `toDataRecord`
 */
trait AggregatesV2AdaptedSourceBase[StorageFormat]
    extends TypedFeatureSource[AggregatesV2Tuple]
    with AdaptedFeatureSource[AggregatesV2Tuple]
    with BatchPairImplicits {

  /* Output root path of aggregates v2 job, excluding store name and version */
  def rootPath: String

  /* Name of store under root path to read */
  def storeName: String

  // max bijection failures
  def maxFailures: Int = 0

  /* Aggregate config used to generate above output */
  def aggregates: Set[TypedAggregateGroup[_]]

  /* trimThreshold Trim all aggregates below a certain threshold to save memory */
  def trimThreshold: Double

  /* Converts one stored value into a DataRecord */
  def toDataRecord(v: StorageFormat): DataRecord

  /* Requested version; None means "use the most recent available version" */
  def sourceVersionOpt: Option[Long]

  /* When true, use the newest available version that is <= sourceVersionOpt */
  def enableMostRecentBeforeSourceVersion: Boolean = false

  implicit private val aggregationKeyInjection: Injection[AggregationKey, Array[Byte]] =
    AggregationKeyInjection
  implicit def storageFormatCodec: Injection[StorageFormat, Array[Byte]]

  // Only the aggregate groups written to this store are handed to the adapter.
  private def filteredAggregates = aggregates.filter(_.outputStore.name == storeName)
  def storePath: String = List(rootPath, storeName).mkString("/")

  def mostRecentVkvs: VersionedKeyValSource[_, _] = {
    VersionedKeyValSource[AggregationKey, (BatchID, StorageFormat)](
      path = storePath,
      sourceVersion = None,
      maxFailures = maxFailures
    )
  }

  // Every call lists the versions of the underlying store, so callers below bind the
  // result to a local value instead of invoking this repeatedly.
  private def availableVersions: Seq[Long] =
    mostRecentVkvs
      .getTap(TapMode.SOURCE)
      .getStore(new JobConf(true))
      .getAllVersions()
      .asScala
      .map(_.toLong)

  private def mostRecentVersion: Long = {
    // List once so the emptiness check and the max are taken over the same snapshot.
    val versions = availableVersions
    require(!versions.isEmpty, s"$storeName has no available versions")
    versions.max
  }

  /**
   * Version of the store that will be read.
   *
   * @throws IllegalArgumentException when the store has no versions, or when
   *         `enableMostRecentBeforeSourceVersion` is set and no version is `<=` the requested one
   */
  def versionToUse: Long =
    if (enableMostRecentBeforeSourceVersion) {
      sourceVersionOpt
        .map { sourceVersion =>
          val versions = availableVersions
          versions.filter(_ <= sourceVersion) match {
            case Seq() =>
              throw new IllegalArgumentException(
                "No version older than version: %s, available versions: %s"
                  .format(sourceVersion, versions)
              )
            case versionList => versionList.max
          }
        }
        .getOrElse(mostRecentVersion)
    } else {
      sourceVersionOpt.getOrElse(mostRecentVersion)
    }

  override lazy val adapter: IRecordOneToManyAdapter[AggregatesV2Tuple] =
    new AggregatesV2Adapter(filteredAggregates, versionToUse, trimThreshold)

  override def getData: TypedPipe[AggregatesV2Tuple] = {
    val vkvsToUse: VersionedKeyValSource[AggregationKey, (BatchID, StorageFormat)] = {
      VersionedKeyValSource[AggregationKey, (BatchID, StorageFormat)](
        path = storePath,
        sourceVersion = Some(versionToUse),
        maxFailures = maxFailures
      )
    }
    TypedPipe.from(vkvsToUse).map {
      case (key, (batch, value)) => (key, (batch, toDataRecord(value)))
    }
  }
}
/*
 * Reads most recent available AggregatesV2FeatureSource
 * on or before the specified beforeDate.
 * Params documented in parent trait.
 *
 * Note on the parameter list: beforeDate has no default value but follows the
 * defaulted trimThreshold, so a caller that wants trimThreshold's default must
 * pass beforeDate (and anything after it) by name.
 */
case class AggregatesV2MostRecentFeatureSourceBeforeDate(
  override val rootPath: String,
  override val storeName: String,
  override val aggregates: Set[TypedAggregateGroup[_]],
  override val trimThreshold: Double = AggregatesV2AdaptedSource.DefaultTrimThreshold,
  beforeDate: RichDate,
  override val maxFailures: Int = 0)
    extends AggregatesV2AdaptedSource {

  // Makes versionToUse pick the largest available version that is <= sourceVersionOpt
  // instead of requiring that exact version.
  override val enableMostRecentBeforeSourceVersion = true
  // NOTE(review): the +1 ms presumably mirrors AggregatesV2FeatureSource, where output for
  // date D is stored at (D+1)T00 — confirm.
  override val sourceVersionOpt: Some[Long] = Some(beforeDate.timestamp + 1)
}
"src/thrift/com/twitter/ml/api:data-java", + "src/thrift/com/twitter/ml/api:interpretable-model-java", + "src/thrift/com/twitter/summingbird", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + "timelines/data_processing/ml_util/aggregation_framework/metrics", + "util/util-core:scala", + ], +) + +scala_library( + name = "for-timelines", + sources = [ + "CombineCountsPolicy.scala", + "SparseBinaryMergePolicy.scala", + ], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "3rdparty/jvm/com/twitter/algebird:core", + "3rdparty/jvm/com/twitter/algebird:util", + "3rdparty/jvm/com/twitter/bijection:core", + "3rdparty/jvm/com/twitter/bijection:json", + "3rdparty/jvm/com/twitter/bijection:netty", + "3rdparty/jvm/com/twitter/bijection:scrooge", + "3rdparty/jvm/com/twitter/bijection:thrift", + "3rdparty/jvm/com/twitter/bijection:util", + "3rdparty/jvm/com/twitter/storehaus:algebra", + "3rdparty/jvm/com/twitter/storehaus:core", + "3rdparty/src/jvm/com/twitter/scalding:commons", + "3rdparty/src/jvm/com/twitter/scalding:core", + "3rdparty/src/jvm/com/twitter/scalding:date", + "3rdparty/src/jvm/com/twitter/summingbird:batch", + "3rdparty/src/jvm/com/twitter/summingbird:core", + "src/java/com/twitter/ml/api:api-base", + "src/java/com/twitter/ml/api/constant", + "src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits", + "src/thrift/com/twitter/dal/personal_data:personal_data-java", + "src/thrift/com/twitter/ml/api:data-java", + "src/thrift/com/twitter/ml/api:interpretable-model-java", + "src/thrift/com/twitter/summingbird", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + "timelines/data_processing/ml_util/aggregation_framework/metrics", + "util/util-core:scala", + ], +) diff --git a/timelines/data_processing/ml_util/aggregation_framework/conversion/CombineCountsPolicy.scala b/timelines/data_processing/ml_util/aggregation_framework/conversion/CombineCountsPolicy.scala new file mode 100644 
/**
 * Shared logic for turning the values of one count feature, collected across several
 * aggregate records, into derived features: sum, number of positive values, mean over
 * the positive values, and the top-k values.
 */
trait CombineCountsBase {
  val SparseSum = "sparse_sum"
  val SparseNonzero = "sparse_nonzero"
  val SparseMean = "sparse_mean"
  val SparseTop = "sparse_top"

  def topK: Int
  def hardLimit: Option[Int]
  def precomputedCountFeatures: Seq[Feature[_]]

  /** Derived output features for every precomputed count feature, built once on first use. */
  lazy val precomputedFeaturesMap: Map[Feature[_], CombinedFeatures] = {
    def deriveFeatures(countFeature: Feature[_]): CombinedFeatures = {
      val personalDataTypes =
        AggregationMetricCommon.derivePersonalDataTypes(Some(countFeature))
      // Output names are "<count feature name>.<suffix>".
      def continuous(suffix: String): Feature[JDouble] =
        new Feature.Continuous(
          countFeature.getDenseFeatureName + "." + suffix,
          personalDataTypes)

      val sumFeature = continuous(SparseSum)
      val nonzeroFeature = continuous(SparseNonzero)
      val meanFeature = continuous(SparseMean)
      val topFeatures = (1 to topK).map(rank => continuous(SparseTop + rank))
      CombinedFeatures(sumFeature, nonzeroFeature, meanFeature, topFeatures)
    }

    precomputedCountFeatures.map { countFeature =>
      (countFeature, deriveFeatures(countFeature))
    }.toMap
  }

  /** Every derived feature this trait may write. */
  lazy val outputFeaturesPostMerge: Set[Feature[JDouble]] =
    precomputedFeaturesMap.values.flatMap { combined: CombinedFeatures =>
      Seq(combined.sum, combined.nonzero, combined.mean) ++ combined.topK
    }.toSet

  private case class ComputedStats(sum: Double, nonzero: Double, mean: Double)

  // "nonzero" counts strictly positive values; the mean is taken over those only.
  private def preComputeStats(featureValues: Seq[Double]): ComputedStats = {
    val total = featureValues.foldLeft(0.0)(_ + _)
    val positives = featureValues.count(_ > 0.0).toDouble
    val mean = if (positives > 0.0) total / positives else 0.0
    ComputedStats(total, positives, mean)
  }

  // Descending order.
  private def computeSortedFeatureValues(featureValues: List[Double]): List[Double] =
    featureValues.sortBy(value => -value)

  // k is 1-based; positions past the end yield 0.0.
  private def extractKth(sortedFeatureValues: Seq[Double], k: Int): Double = {
    val index = k - 1
    if (sortedFeatureValues.isDefinedAt(index)) sortedFeatureValues(index) else 0.0
  }

  private def setContinuousFeatureIfNonZero(
    record: SRichDataRecord,
    feature: Feature[JDouble],
    value: Double
  ): Unit =
    if (value != 0.0) {
      record.setFeatureValue(feature, value)
    }

  /**
   * Writes the derived features for each of `features` that has an entry in
   * `featureValuesMap`; features without an entry are skipped.
   */
  def hydrateCountFeatures(
    richRecord: SRichDataRecord,
    features: Seq[Feature[_]],
    featureValuesMap: Map[Feature[_], List[Double]]
  ): Unit =
    features.foreach { feature =>
      featureValuesMap.get(feature).foreach { values =>
        mergeRecordFromCountFeature(
          richInputRecord = richRecord,
          countFeature = feature,
          featureValues = values
        )
      }
    }

  /**
   * Computes sum / nonzero / mean / top-k from `featureValues` and writes every
   * non-zero result into `richInputRecord`. Does nothing for an empty list.
   */
  def mergeRecordFromCountFeature(
    richInputRecord: SRichDataRecord,
    countFeature: Feature[_],
    featureValues: List[Double]
  ): Unit = {
    // Most calls arrive with an empty list, so bail out before sorting, allocating
    // or walking [1, topK].
    if (featureValues.nonEmpty) {
      val descending = computeSortedFeatureValues(featureValues)
      // With a hard limit, only the largest `limit` values take part in the statistics.
      val sortedFeatureValues =
        hardLimit.fold(descending)(limit => descending.take(limit)).toIndexedSeq
      val stats = preComputeStats(sortedFeatureValues)
      val combined = precomputedFeaturesMap(countFeature)

      setContinuousFeatureIfNonZero(richInputRecord, combined.sum, stats.sum)
      setContinuousFeatureIfNonZero(richInputRecord, combined.nonzero, stats.nonzero)
      setContinuousFeatureIfNonZero(richInputRecord, combined.mean, stats.mean)

      for (rank <- 1 to topK) {
        setContinuousFeatureIfNonZero(
          richInputRecord,
          combined.topK(rank - 1),
          extractKth(sortedFeatureValues, rank)
        )
      }
    }
  }
}
/**
 * Merge policy for aggregates whose features are all counts (computed with
 * CountMetric), typically impressions or engagements. For every such input count
 * feature, (3 + k) derived features are written to the output record:
 *
 *  - sum of the feature's value across all aggregate records
 *  - number of aggregate records in which the feature is non-zero
 *  - mean of the feature's value across all aggregate records
 *  - the topK values of the feature across all aggregate records
 *
 * @param topK                         number of top values to emit
 * @param aggregateContextToPrecompute context from which the count features are selected
 * @param hardLimit                    when set, values are sorted and only the largest
 *                                     `hardLimit` of them are used for aggregation
 */
case class CombineCountsPolicy(
  override val topK: Int,
  aggregateContextToPrecompute: FeatureContext,
  override val hardLimit: Option[Int] = None)
    extends SparseBinaryMergePolicy
    with CombineCountsBase {
  import CombineCountsPolicy._

  override val precomputedCountFeatures: Seq[Feature[_]] =
    getCountFeatures(aggregateContextToPrecompute)

  /** Ignores `aggregateContext`: it is assumed to equal `aggregateContextToPrecompute`. */
  override def mergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    aggregateContext: FeatureContext
  ): Unit =
    mergeRecordFromCountFeatures(mutableInputRecord, aggregateRecords, precomputedCountFeatures)

  /** Same as `mergeRecord`, for callers that have no feature context at hand. */
  def defaultMergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord]
  ): Unit =
    mergeRecordFromCountFeatures(mutableInputRecord, aggregateRecords, precomputedCountFeatures)

  /** Hydrates the derived features of each of `countFeatures` into `mutableInputRecord`. */
  def mergeRecordFromCountFeatures(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    countFeatures: Seq[Feature[_]]
  ): Unit = {
    // One wrapper is shared by all count features.
    val richRecord = new SRichDataRecord(mutableInputRecord)
    for (countFeature <- countFeatures) {
      val values = getFeatureValues(aggregateRecords, countFeature)
      mergeRecordFromCountFeature(
        richInputRecord = richRecord,
        countFeature = countFeature,
        featureValues = values
      )
    }
  }

  override def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]] =
    outputFeaturesPostMerge.map(feature => feature.asInstanceOf[Feature[_]])
}
*/ + def apply( + inputDataSet: DataSetPipe, + skewedJoinKeys: Product, + joinFeaturesDataSet: DataSetPipe, + sketchNumReducers: Int = DefaultSketchNumReducers + ): DataSetPipe = { + val joinKeyList = skewedJoinKeys.productIterator.toList.asInstanceOf[List[Feature[_]]] + + def makeKey(record: DataRecord): String = + joinKeyList + .map(SRichDataRecord(record).getFeatureValue(_)) + .toString + + def byKey(pipe: DataSetPipe): TypedPipe[(String, DataRecord)] = + pipe.records.map(record => (makeKey(record), record)) + + val joinedRecords = byKey(inputDataSet) + .sketch(sketchNumReducers) + .leftJoin(byKey(joinFeaturesDataSet)) + .values + .map { + case (inputRecord, joinFeaturesOpt) => + joinFeaturesOpt.foreach { joinRecord => dataRecordMerger.merge(inputRecord, joinRecord) } + inputRecord + } + + DataSetPipe( + joinedRecords, + FeatureContext.merge(inputDataSet.featureContext, joinFeaturesDataSet.featureContext) + ) + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/conversion/PickFirstRecordPolicy.scala b/timelines/data_processing/ml_util/aggregation_framework/conversion/PickFirstRecordPolicy.scala new file mode 100644 index 0000000000..b022d35b01 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/conversion/PickFirstRecordPolicy.scala @@ -0,0 +1,26 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion + +import com.twitter.ml.api._ +import com.twitter.ml.api.FeatureContext +import scala.collection.JavaConverters._ + +/* + * A really bad default merge policy that picks all the aggregate + * features corresponding to the first sparse key value in the list. + * Does not rename any of the aggregate features for simplicity. + * Avoid using this merge policy if at all possible. 
/** Builds the [[CtrDescriptor]]s shared by the CTR-based merge policies. */
object PickTopCtrBuilderHelper {

  /**
   * Creates one descriptor per engagement aggregate found under `aggregatePrefix`.
   *
   * The impression feature is an output feature of a descriptor that has neither a
   * feature nor a label in its query. Engagement features are the output features of
   * descriptors with no feature and a label contained in `engagementLabels`.
   *
   * @param aggregatePrefix     only aggregate groups with this prefix are considered
   * @param engagementLabels    labels whose aggregates count as engagements
   * @param aggregatesToCompute all candidate aggregate groups
   * @param outputSuffix        appended (after a dot) to the engagement feature name to
   *                            form the output feature name
   * @return one descriptor per engagement feature; empty when there is none
   * @throws NoSuchElementException when no impression feature exists for `aggregatePrefix`
   */
  def createCtrDescriptors(
    aggregatePrefix: String,
    engagementLabels: Set[Feature[JBoolean]],
    aggregatesToCompute: Set[TypedAggregateGroup[_]],
    outputSuffix: String
  ): Set[CtrDescriptor] = {
    val aggregateFeatures = aggregatesToCompute
      .filter(_.aggregatePrefix == aggregatePrefix)

    // NOTE(review): if several output features match, the one picked depends on the
    // iteration order of the Set — confirm that at most one is ever expected here.
    val impressionFeature = aggregateFeatures
      .flatMap { group =>
        group.individualAggregateDescriptors
          .filter(_.query.feature == None)
          .filter(_.query.label == None)
          .flatMap(_.outputFeatures)
      }
      .headOption
      .getOrElse(
        // Same exception type that `.head` raised, but with an actionable message.
        throw new NoSuchElementException(
          s"No impression feature (aggregate without feature and label) found for " +
            s"aggregate prefix '$aggregatePrefix'"
        )
      )
      .asInstanceOf[Feature[JDouble]]

    val aggregateEngagementFeatures =
      aggregateFeatures
        .flatMap { group =>
          group.individualAggregateDescriptors
            .filter(_.query.feature == None)
            .filter { descriptor =>
              //TODO: we should remove the need to pass around engagementLabels and just use all the labels available.
              descriptor.query.label.exists(engagementLabels.contains(_))
            }
            .flatMap(_.outputFeatures)
        }
        .map(_.asInstanceOf[Feature[JDouble]])

    aggregateEngagementFeatures
      .map { aggregateEngagementFeature =>
        CtrDescriptor(
          engagementFeature = aggregateEngagementFeature,
          impressionFeature = impressionFeature,
          outputFeature = new Feature.Continuous(
            aggregateEngagementFeature.getDenseFeatureName + "." + outputSuffix,
            AggregationMetricCommon.derivePersonalDataTypes(
              Some(aggregateEngagementFeature),
              Some(impressionFeature)
            )
          )
        )
      }
  }
}
/*
 * Base class for merge policies built on engagement rates. For each CtrDescriptor,
 * a rate is computed per aggregate record from its impression and engagement
 * features, the rates are sorted in descending order, combined into a single
 * score, and that score is written to the descriptor's outputFeature on the
 * input record. No other feature is written.
 *
 * Subclasses define the policy by implementing calculateCtr and
 * combineTopNCtrsToSingleScore.
 */
abstract class PickTopCtrPolicyBase(ctrDescriptors: Set[CtrDescriptor])
    extends SparseBinaryMergePolicy {

  // A missing (null) feature value is read as 0.0.
  private def getContinuousFeature(
    aggregateRecord: DataRecord,
    feature: Feature[JDouble]
  ): Double =
    Option(SRichDataRecord(aggregateRecord).getFeatureValue(feature)) match {
      case Some(value) => value.asInstanceOf[JDouble].toDouble
      case None => 0.0
    }

  /**
   * For every provided descriptor, compute the corresponding CTR score and hydrate
   * only that score into the provided input record.
   */
  override def mergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    aggregateContext: FeatureContext
  ): Unit =
    for (descriptor <- ctrDescriptors) {
      val ctrs = aggregateRecords.map { aggregateRecord =>
        val impressions = getContinuousFeature(aggregateRecord, descriptor.impressionFeature)
        val engagements = getContinuousFeature(aggregateRecord, descriptor.engagementFeature)
        calculateCtr(impressions, engagements)
      }
      val descendingCtrs = ctrs.sortBy(ctr => -ctr)

      // Nothing is written when the subclass yields no score.
      combineTopNCtrsToSingleScore(descendingCtrs).foreach { score =>
        SRichDataRecord(mutableInputRecord).setFeatureValue(descriptor.outputFeature, score)
      }
    }

  /** Engagement rate for one aggregate record. */
  protected def calculateCtr(impressions: Double, engagements: Double): Double

  /** Reduces the descending rates to the score to emit; `None` emits nothing. */
  protected def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double]

  override def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]] =
    ctrDescriptors
      .map(descriptor => descriptor.outputFeature)
      .toSet
}
which roughly says "the actual engagement + * rate is at least this value" with confidence designated by the z-score: + * https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval + */ + override def calculateCtr(rawImpressions: Double, engagements: Double): Double = { + // just in case engagements happens to be more than impressions... + val impressions = Math.max(rawImpressions, engagements) + + if (impressions > 0.0) { + val p = engagements / impressions + (p + + zSquaredDiv2 / impressions + - z * Math.sqrt( + (p * (1.0 - p) + zSquaredDiv4 / impressions) / impressions)) / (1.0 + zSquared / impressions) + + } else 0.0 + } + + /** + * takes the topN engagement rates, and returns the joint probability as {1.0 - Π(1.0 - p)} + * + * e.g. let's say you have 0.6 chance of clicking on a tweet shared by the user A. + * you also have 0.3 chance of clicking on a tweet shared by the user B. + * seeing a tweet shared by both A and B will not lead to 0.9 chance of you clicking on it. + * but you could say that you have 0.4*0.7 chance of NOT clicking on that tweet. 
+ */ + override def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double] = + if (sortedCtrs.nonEmpty) { + val inverseLogP = sortedCtrs + .take(topN).map { p => Math.log(1.0 - p) }.sum + Some(1.0 - Math.exp(inverseLogP)) + } else None + +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/conversion/SparseBinaryAggregateJoin.scala b/timelines/data_processing/ml_util/aggregation_framework/conversion/SparseBinaryAggregateJoin.scala new file mode 100644 index 0000000000..10c6a90965 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/conversion/SparseBinaryAggregateJoin.scala @@ -0,0 +1,199 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion + +import com.twitter.ml.api._ +import com.twitter.ml.api.Feature +import com.twitter.ml.api.util.SRichDataRecord +import com.twitter.scalding.typed.TypedPipe +import com.twitter.scalding.typed.UnsortedGrouped +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup +import java.util.{Set => JSet} +import scala.collection.JavaConverters._ + +object SparseBinaryAggregateJoin { + import TypedAggregateGroup._ + + def makeKey(record: DataRecord, joinKeyList: List[Feature[_]]): String = { + joinKeyList.map { + case sparseKey: Feature.SparseBinary => + SRichDataRecord(record).getFeatureValue(sparseFeature(sparseKey)) + case nonSparseKey: Feature[_] => + SRichDataRecord(record).getFeatureValue(nonSparseKey) + }.toString + } + + /** + * @param record Data record to get all possible sparse aggregate keys from + * @param List of join key features (some can be sparse and some non-sparse) + * @return A list of string keys to use for joining + */ + def makeKeyPermutations(record: DataRecord, joinKeyList: List[Feature[_]]): List[String] = { + val allIdValues = joinKeyList.flatMap { + case sparseKey: Feature.SparseBinary => { + val id = sparseKey.getDenseFeatureId + val valuesOpt = 
Option(SRichDataRecord(record).getFeatureValue(sparseKey)) + .map(_.asInstanceOf[JSet[String]].asScala.toSet) + valuesOpt.map { (id, _) } + } + case nonSparseKey: Feature[_] => { + val id = nonSparseKey.getDenseFeatureId + Option(SRichDataRecord(record).getFeatureValue(nonSparseKey)).map { value => + (id, Set(value.toString)) + } + } + } + sparseBinaryPermutations(allIdValues).toList.map { idValues => + joinKeyList.map { key => idValues.getOrElse(key.getDenseFeatureId, "") }.toString + } + } + + private[this] def mkKeyIndexedAggregates( + joinFeaturesDataSet: DataSetPipe, + joinKeyList: List[Feature[_]] + ): TypedPipe[(String, DataRecord)] = + joinFeaturesDataSet.records + .map { record => (makeKey(record, joinKeyList), record) } + + private[this] def mkKeyIndexedInput( + inputDataSet: DataSetPipe, + joinKeyList: List[Feature[_]] + ): TypedPipe[(String, DataRecord)] = + inputDataSet.records + .flatMap { record => + for { + key <- makeKeyPermutations(record, joinKeyList) + } yield { (key, record) } + } + + private[this] def mkKeyIndexedInputWithUniqueId( + inputDataSet: DataSetPipe, + joinKeyList: List[Feature[_]], + uniqueIdFeatureList: List[Feature[_]] + ): TypedPipe[(String, String)] = + inputDataSet.records + .flatMap { record => + for { + key <- makeKeyPermutations(record, joinKeyList) + } yield { (key, makeKey(record, uniqueIdFeatureList)) } + } + + private[this] def mkRecordIndexedAggregates( + keyIndexedInput: TypedPipe[(String, DataRecord)], + keyIndexedAggregates: TypedPipe[(String, DataRecord)] + ): UnsortedGrouped[DataRecord, List[DataRecord]] = + keyIndexedInput + .join(keyIndexedAggregates) + .map { case (_, (inputRecord, aggregateRecord)) => (inputRecord, aggregateRecord) } + .group + .toList + + private[this] def mkRecordIndexedAggregatesWithUniqueId( + keyIndexedInput: TypedPipe[(String, String)], + keyIndexedAggregates: TypedPipe[(String, DataRecord)] + ): UnsortedGrouped[String, List[DataRecord]] = + keyIndexedInput + .join(keyIndexedAggregates) + 
.map { case (_, (inputId, aggregateRecord)) => (inputId, aggregateRecord) } + .group + .toList + + def mkJoinedDataSet( + inputDataSet: DataSetPipe, + joinFeaturesDataSet: DataSetPipe, + recordIndexedAggregates: UnsortedGrouped[DataRecord, List[DataRecord]], + mergePolicy: SparseBinaryMergePolicy + ): TypedPipe[DataRecord] = + inputDataSet.records + .map(record => (record, ())) + .leftJoin(recordIndexedAggregates) + .map { + case (inputRecord, (_, aggregateRecordsOpt)) => + aggregateRecordsOpt + .map { aggregateRecords => + mergePolicy.mergeRecord( + inputRecord, + aggregateRecords, + joinFeaturesDataSet.featureContext + ) + inputRecord + } + .getOrElse(inputRecord) + } + + def mkJoinedDataSetWithUniqueId( + inputDataSet: DataSetPipe, + joinFeaturesDataSet: DataSetPipe, + recordIndexedAggregates: UnsortedGrouped[String, List[DataRecord]], + mergePolicy: SparseBinaryMergePolicy, + uniqueIdFeatureList: List[Feature[_]] + ): TypedPipe[DataRecord] = + inputDataSet.records + .map(record => (makeKey(record, uniqueIdFeatureList), record)) + .leftJoin(recordIndexedAggregates) + .map { + case (_, (inputRecord, aggregateRecordsOpt)) => + aggregateRecordsOpt + .map { aggregateRecords => + mergePolicy.mergeRecord( + inputRecord, + aggregateRecords, + joinFeaturesDataSet.featureContext + ) + inputRecord + } + .getOrElse(inputRecord) + } + + /** + * If uniqueIdFeatures is non-empty and the join keys include a sparse binary + * key, the join will use this set of keys as a unique id to reduce + * memory consumption. You should need this option only for + * memory-intensive joins to avoid OOM errors. 
+ */ + def apply( + inputDataSet: DataSetPipe, + joinKeys: Product, + joinFeaturesDataSet: DataSetPipe, + mergePolicy: SparseBinaryMergePolicy = PickFirstRecordPolicy, + uniqueIdFeaturesOpt: Option[Product] = None + ): DataSetPipe = { + val joinKeyList = joinKeys.productIterator.toList.asInstanceOf[List[Feature[_]]] + val sparseBinaryJoinKeySet = + joinKeyList.toSet.filter(_.getFeatureType() == FeatureType.SPARSE_BINARY) + val containsSparseBinaryKey = !sparseBinaryJoinKeySet.isEmpty + if (containsSparseBinaryKey) { + val uniqueIdFeatureList = uniqueIdFeaturesOpt + .map(uniqueIdFeatures => + uniqueIdFeatures.productIterator.toList.asInstanceOf[List[Feature[_]]]) + .getOrElse(List.empty[Feature[_]]) + val keyIndexedAggregates = mkKeyIndexedAggregates(joinFeaturesDataSet, joinKeyList) + val joinedDataSet = if (uniqueIdFeatureList.isEmpty) { + val keyIndexedInput = mkKeyIndexedInput(inputDataSet, joinKeyList) + val recordIndexedAggregates = + mkRecordIndexedAggregates(keyIndexedInput, keyIndexedAggregates) + mkJoinedDataSet(inputDataSet, joinFeaturesDataSet, recordIndexedAggregates, mergePolicy) + } else { + val keyIndexedInput = + mkKeyIndexedInputWithUniqueId(inputDataSet, joinKeyList, uniqueIdFeatureList) + val recordIndexedAggregates = + mkRecordIndexedAggregatesWithUniqueId(keyIndexedInput, keyIndexedAggregates) + mkJoinedDataSetWithUniqueId( + inputDataSet, + joinFeaturesDataSet, + recordIndexedAggregates, + mergePolicy, + uniqueIdFeatureList + ) + } + + DataSetPipe( + joinedDataSet, + mergePolicy.mergeContext( + inputDataSet.featureContext, + joinFeaturesDataSet.featureContext + ) + ) + } else { + inputDataSet.joinWithSmaller(joinKeys, joinFeaturesDataSet) { _.pass } + } + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/conversion/SparseBinaryMergePolicy.scala b/timelines/data_processing/ml_util/aggregation_framework/conversion/SparseBinaryMergePolicy.scala new file mode 100644 index 0000000000..7201e39a2c --- /dev/null +++ 
b/timelines/data_processing/ml_util/aggregation_framework/conversion/SparseBinaryMergePolicy.scala @@ -0,0 +1,81 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion + +import com.twitter.ml.api._ +import com.twitter.ml.api.FeatureContext +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup +import scala.collection.JavaConverters._ + +/** + * When using the aggregates framework to group by sparse binary keys, + * we generate different aggregate feature values for each possible + * value of the sparse key. Hence, when joining back the aggregate + * features with a training data set, each individual training record + * has multiple aggregate features to choose from, for each value taken + * by the sparse key(s) in the training record. The merge policy trait + * below specifies how to condense/combine this variable number of + * aggregate features into a constant number of features for training. + * Some simple policies might be: pick the first feature set (randomly), + * pick the top sorted by some attribute, or take some average. + * + * Example: suppose we group by (ADVERTISER_ID, INTEREST_ID) where INTEREST_ID + * is the sparse key, and compute a "CTR" aggregate feature for each such + * pair measuring the click through rate on ads with (ADVERTISER_ID, INTEREST_ID). + * Say we have the following aggregate records: + * + * (ADVERTISER_ID = 1, INTEREST_ID = 1, CTR = 5%) + * (ADVERTISER_ID = 1, INTEREST_ID = 2, CTR = 15%) + * (ADVERTISER_ID = 2, INTEREST_ID = 1, CTR = 1%) + * (ADVERTISER_ID = 2, INTEREST_ID = 2, CTR = 10%) + * ... + * At training time, each training record has one value for ADVERTISER_ID, but it + * has multiple values for INTEREST_ID e.g. 
+ * + * (ADVERTISER_ID = 1, INTEREST_IDS = (1,2)) + * + * There are multiple potential CTRs we can get when joining in the aggregate features: + * in this case 2 values (5% and 15%) but in general it could be many depending on how + * many interests the user has. When joining back the CTR features, the merge policy says how to + * combine all these CTRs to engineer features. + * + * "Pick first" would say - pick some random CTR (whatever is first in the list, maybe 5%) + * for training (probably not a good policy). "Sort by CTR" could be a policy + * that just picks the top CTR and uses it as a feature (here 15%). Similarly, you could + * imagine "Top K sorted by CTR" (use both 5 and 15%) or "Avg CTR" (10%) or other policies, + * all of which are defined as objects/case classes that override this trait. + */ +trait SparseBinaryMergePolicy { + + /** + * @param mutableInputRecord Input record to add aggregates to + * @param aggregateRecords Aggregate feature records + * @param aggregateContext Context for aggregate records + */ + def mergeRecord( + mutableInputRecord: DataRecord, + aggregateRecords: List[DataRecord], + aggregateContext: FeatureContext + ): Unit + + def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]] + + /** + * @param inputContext Context for input record + * @param aggregateContext Context for aggregate records + * @return Context for record returned by mergeRecord() + */ + def mergeContext( + inputContext: FeatureContext, + aggregateContext: FeatureContext + ): FeatureContext = new FeatureContext( + (inputContext.getAllFeatures.asScala.toSet ++ aggregateFeaturesPostMerge( + aggregateContext)).toSeq.asJava + ) + + def allOutputFeaturesPostMergePolicy[T](config: TypedAggregateGroup[T]): Set[Feature[_]] = { + val containsSparseBinary = config.keysToAggregate + .exists(_.getFeatureType == FeatureType.SPARSE_BINARY) + + if (!containsSparseBinary) config.allOutputFeatures + else aggregateFeaturesPostMerge(new 
FeatureContext(config.allOutputFeatures.toSeq.asJava)) + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/conversion/SparseBinaryMultipleAggregateJoin.scala b/timelines/data_processing/ml_util/aggregation_framework/conversion/SparseBinaryMultipleAggregateJoin.scala new file mode 100644 index 0000000000..d0aff7e344 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/conversion/SparseBinaryMultipleAggregateJoin.scala @@ -0,0 +1,109 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion + +import com.twitter.bijection.Injection +import com.twitter.ml.api._ +import com.twitter.ml.api.Feature +import com.twitter.ml.api.util.SRichDataRecord +import com.twitter.scalding.typed.TypedPipe +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup.sparseFeature +import scala.collection.JavaConverters._ + +case class SparseJoinConfig( + aggregates: DataSetPipe, + sparseKey: Feature.SparseBinary, + mergePolicies: SparseBinaryMergePolicy*) + +object SparseBinaryMultipleAggregateJoin { + type CommonMap = (String, ((Feature.SparseBinary, String), DataRecord)) + + def apply( + source: DataSetPipe, + commonKey: Feature[_], + joinConfigs: Set[SparseJoinConfig], + rightJoin: Boolean = false, + isSketchJoin: Boolean = false, + numSketchJoinReducers: Int = 0 + ): DataSetPipe = { + val emptyPipe: TypedPipe[CommonMap] = TypedPipe.empty + val aggregateMaps: Set[TypedPipe[CommonMap]] = joinConfigs.map { joinConfig => + joinConfig.aggregates.records.map { record => + val sparseKeyValue = + SRichDataRecord(record).getFeatureValue(sparseFeature(joinConfig.sparseKey)).toString + val commonKeyValue = SRichDataRecord(record).getFeatureValue(commonKey).toString + (commonKeyValue, ((joinConfig.sparseKey, sparseKeyValue), record)) + } + } + + val commonKeyToAggregateMap = aggregateMaps + .foldLeft(emptyPipe) { + case (union: TypedPipe[CommonMap], next: TypedPipe[CommonMap]) => + 
union ++ next + } + .group + .toList + .map { + case (commonKeyValue, aggregateTuples) => + (commonKeyValue, aggregateTuples.toMap) + } + + val commonKeyToRecordMap = source.records + .map { record => + val commonKeyValue = SRichDataRecord(record).getFeatureValue(commonKey).toString + (commonKeyValue, record) + } + + // rightJoin is not supported by Sketched, so rightJoin will be ignored if isSketchJoin is set + implicit val string2Byte = (value: String) => Injection[String, Array[Byte]](value) + val intermediateRecords = if (isSketchJoin) { + commonKeyToRecordMap.group + .sketch(numSketchJoinReducers) + .leftJoin(commonKeyToAggregateMap) + .toTypedPipe + } else if (rightJoin) { + commonKeyToAggregateMap + .rightJoin(commonKeyToRecordMap) + .mapValues(_.swap) + .toTypedPipe + } else { + commonKeyToRecordMap.leftJoin(commonKeyToAggregateMap).toTypedPipe + } + + val joinedRecords = intermediateRecords + .map { + case (commonKeyValue, (inputRecord, aggregateTupleMapOpt)) => + aggregateTupleMapOpt.foreach { aggregateTupleMap => + joinConfigs.foreach { joinConfig => + val sparseKeyValues = Option( + SRichDataRecord(inputRecord) + .getFeatureValue(joinConfig.sparseKey) + ).map(_.asScala.toList) + .getOrElse(List.empty[String]) + + val aggregateRecords = sparseKeyValues.flatMap { sparseKeyValue => + aggregateTupleMap.get((joinConfig.sparseKey, sparseKeyValue)) + } + + joinConfig.mergePolicies.foreach { mergePolicy => + mergePolicy.mergeRecord( + inputRecord, + aggregateRecords, + joinConfig.aggregates.featureContext + ) + } + } + } + inputRecord + } + + val joinedFeatureContext = joinConfigs + .foldLeft(source.featureContext) { + case (left, joinConfig) => + joinConfig.mergePolicies.foldLeft(left) { + case (soFar, mergePolicy) => + mergePolicy.mergeContext(soFar, joinConfig.aggregates.featureContext) + } + } + + DataSetPipe(joinedRecords, joinedFeatureContext) + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/docs/AUTOMATED_COMMIT_FILES 
b/timelines/data_processing/ml_util/aggregation_framework/docs/AUTOMATED_COMMIT_FILES new file mode 100644 index 0000000000..80aaae8d96 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/docs/AUTOMATED_COMMIT_FILES @@ -0,0 +1,5 @@ +aggregation.rst +batch.rst +index.rst +real-time.rst +troubleshooting.rst diff --git a/timelines/data_processing/ml_util/aggregation_framework/docs/aggregation.rst b/timelines/data_processing/ml_util/aggregation_framework/docs/aggregation.rst new file mode 100644 index 0000000000..fddd926b43 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/docs/aggregation.rst @@ -0,0 +1,167 @@ +.. _aggregation: + +Core Concepts +============= + +This page provides an overview of the aggregation framework and goes through examples on how to define aggregate features. In general, we can think of an aggregate feature as a grouped set of records, on which we incrementally update the aggregate feature values, crossed by the provided features and conditional on the provided labels. + +AggregateGroup +-------------- + +An `AggregateGroup` defines a single unit of aggregate computation, similar to a SQL query. These are executed by the underlying jobs (internally, a `DataRecordAggregationMonoid `_ is applied to `DataRecords` that contain the features to aggregate). Many of these groups can exist to define different types of aggregate features. + +Let's start with the following examples of an `AggregateGroup` to discuss the meaning of each of its constructor arguments: + +.. 
code-block:: scala + + val UserAggregateStore = "user_aggregates" + val aggregatesToCompute: Set[TypedAggregateGroup[_]] = Set( + AggregateGroup( + inputSource = timelinesDailyRecapSource, + aggregatePrefix = "user_aggregate_v2", + preTransformOpt = Some(RemoveUserIdZero), + keys = Set(USER_ID), + features = Set(HAS_PHOTO), + labels = Set(IS_FAVORITED), + metrics = Set(CountMetric, SumMetric), + halfLives = Set(50.days), + outputStore = OfflineAggregateStore( + name = UserAggregateStore, + startDate = "2016-07-15 00:00", + commonConfig = timelinesDailyAggregateSink, + batchesToKeep = 5 + ) + ) + .flatMap(_.buildTypedAggregateGroups) + ) + +This `AggregateGroup` computes the number of times each user has faved a tweet with a photo. The aggregate count is decayed with a 50 day halflife. + +Naming and preprocessing +------------------------ + +`UserAggregateStore` is a string val that acts as a scope of a "root path" to which this group of aggregate features will be written. The root path is provided separately by the implementing job. + +`inputSource` defines the input source of `DataRecords` that we aggregate on. These records contain the relevant features required for aggregation. + +`aggregatePrefix` tells the framework what prefix to use for the aggregate features it generates. A descriptive naming scheme with versioning makes it easier to maintain features as you add or remove them over the long-term. + +`preTransforms` is a `Seq[com.twitter.ml.api.ITransform] `_ that can be applied to the data records read from the input source before they are fed into the `AggregateGroup` to apply aggregation. These transforms are optional but can be useful for certain preprocessing operations for a group's raw input features. + +.. admonition:: Examples + + You can downsample input data records by providing `preTransforms`. In addition, you could also join different input labels (e.g. 
"is_push_openend" and "is_push_favorited") and transform them into a combined label that is their union ("is_push_engaged") on which aggregate counts will be calculated. + + +Keys +---- + +`keys` is a crucial field in the config. It defines a `Set[com.twitter.ml.api.Feature]` which specifies a set of grouping keys to use for this `AggregateGroup`. + +Keys can only be of 3 supported types currently: `DISCRETE`, `STRING` and `SPARSE_BINARY`. Using a discrete or a string/text feature as a key specifies the unit to group records by before applying counting/aggregation operators. + + +.. admonition:: Examples + + .. cssclass:: shortlist + + #. If the key is `USER_ID`, this tells the framework to group all records by `USER_ID`, and then apply aggregations (sum/count/etc) within each user’s data to generate aggregate features for each user. + + #. If the key is `(USER_ID, AUTHOR_ID)`, then the `AggregateGroup` will output features for each unique user-author pair in the input data. + + #. Finally, using a sparse binary feature as key has special "flattening" or "flatMap" like semantics. For example, consider grouping by `(USER_ID, AUTHOR_INTEREST_IDS)` where `AUTHOR_INTEREST_IDS` is a sparse binary feature which represents a set of topic IDs the author may be tweeting about. This creates one record for each `(user_id, interest_id)` pair - so each record with multiple author interests is flattened before feeding it to the aggregation. + +Features +-------- + +`features` specifies a `Set[com.twitter.ml.api.Feature]` to aggregate within each group (defined by the keys specified earlier). + +We support 2 types of `features`: `BINARY` and `CONTINUOUS`. + +The semantics of how the aggregation works is slightly different based on the type of “feature”, and based on the “metric” (or aggregation operation): + +.. cssclass:: shortlist + +#. 
Binary Feature, Count Metric: Suppose we have a binary feature `HAS_PHOTO` in this set, and are applying the “Count” metric (see below for more details on the metrics), with key `USER_ID`. The semantics is that this computes a feature which measures the count of records with `HAS_PHOTO` set to true for each user. + +#. Binary Feature, Sum Metric - Does not apply. No feature will be computed. + +#. Continuous Feature, Count Metric - The count metric treats all features as binary features ignoring their value. For example, suppose we have a continuous feature `NUM_CHARACTERS_IN_TWEET`, and key `USER_ID`. This measures the count of records that have this feature `NUM_CHARACTERS_IN_TWEET` present. + +#. Continuous Feature, Sum Metric - In the above example, the feature measures the sum of (num_characters_in_tweet) over all of a user’s records. Dividing this sum feature by the count feature would give the average number of characters in all tweets. + +.. admonition:: Unsupported feature types + + `DISCRETE` and `SPARSE` features are not supported by the Sum Metric, because there is no meaning in summing a discrete feature or a sparse feature. You can use them with the CountMetric, but they may not do what you would expect since they will be treated as binary features losing all the information within the feature. The best way to use these is as “keys” and not as “features”. + +.. admonition:: Setting includeAnyFeature + + If constructor argument `includeAnyFeature` is set, the framework will append a feature with scope `any_feature` to the set of all features you define. This additional feature simply measures the total count of records. So if you set your features to be equal to Set.empty, this will measure the count of records for a given `USER_ID`. + +Labels +------ + +`labels` specifies a set of `BINARY` features that you can cross with, prior to applying aggregations on the `features`.
This essentially restricts the aggregate computation to a subset of the records within a particular key. + +We typically use this to represent engagement labels in an ML model, in this case, `IS_FAVORITED`. + +In this example, we are grouping by `USER_ID`, the feature is `HAS_PHOTO`, the label is `IS_FAVORITED`, and we are computing `CountMetric`. The system will output a feature for each user that represents the number of favorites on tweets having photos by this `userId`. + +.. admonition:: Setting includeAnyLabel + + If constructor argument `includeAnyLabel` is set (as it is by default), then similar to `any_feature`, the framework automatically appends a label of type `any_label` to the set of all labels you define, which represents not applying any filter or cross. + +In this example, `any_label` and `any_feature` are set by default and the system would actually output 4 features for each `user_id`: + +.. cssclass:: shortlist + +#. The number of `IS_FAVORITED` (favorites) on tweet impressions having `HAS_PHOTO=true` + +#. The number of `IS_FAVORITED` (favorites) on all tweet impressions (`any_feature` aggregate) + +#. The number of tweet impressions having `HAS_PHOTO=true` (`any_label` aggregate) + +#. The total number of tweet impressions for this user id (`any_feature.any_label` aggregate) + +.. admonition:: Disabling includeAnyLabel + + To disable this automatically generated feature you can use `includeAnyLabel = false` in your config. This will remove some useful features (particularly for counterfactual signal), but it can greatly save on space since it does not store every possible impressed set of keys in the output store. So use this if you are short on space, but not otherwise. + +Metrics +------- + +`metrics` specifies the aggregate operators to apply. The most commonly used are `Count`, `Sum` and `SumSq`. + +As mentioned before, `Count` can be applied to all types of features, but treats every feature as binary and ignores the value of the feature. 
`Sum` and `SumSq` can only be applied to Continuous features - they will ignore all other features you specify. By combining sum and sumsq and count, you can produce powerful “z-score” features or other distributional features using a post-transform. + +It is also possible to add your own aggregate operators (e.g. `LastResetMetric `_) to the framework with some additional work. + +HalfLives +--------- + +`halfLives` specifies how fast aggregate features should be decayed. It is important to note that the framework works on an incremental basis: in the batch implementation, the summingbird-scalding job takes in the most recently computed aggregate features, processed on data until day `N-1`, then reads new data records for day `N` and computes updated values of the aggregate features. Similarly, the decay of real-time aggregate features takes the actual time delta between the current time and the last time the aggregate feature value was updated. + +The halflife `H` specifies how fast to decay old sums/counts to simulate a sliding window of counts. The implementation is such that it will take `H` amount of time to decay an aggregate feature to half its initial value. New observed values of sums/counts are added to the aggregate feature value. + +.. admonition:: Batch and real-time + + In the batch use case where aggregate features are recomputed on a daily basis, we typically take halflives on the order of weeks or longer (in Timelines, 50 days). In the real-time use case, shorter halflives are appropriate (hours) since they are updated as client engagements are received by the summingbird job. + + +SQL Equivalent +-------------- +Conceptually, you can also think of it as: + +.. code-block:: sql + + INSERT INTO <outputStore>.<aggregatePrefix> + SELECT AGG(<features>) /* AGG is <metrics>, which is an exponentially decaying SUM or COUNT etc. based on the halfLives */ + FROM ( + SELECT preTransformOpt(*) FROM <inputSource> + ) + GROUP BY <keys> + WHERE <labels> = True + +any_features is AGG(*). + +any_labels removes the WHERE clause.
\ No newline at end of file diff --git a/timelines/data_processing/ml_util/aggregation_framework/docs/batch.rst b/timelines/data_processing/ml_util/aggregation_framework/docs/batch.rst new file mode 100644 index 0000000000..f3b6ac9a50 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/docs/batch.rst @@ -0,0 +1,215 @@ +.. _batch: + +Batch aggregate feature jobs +============================ + +In the previous section, we went over the core concepts of the aggregation framework and discussed how you can set up your own `AggregateGroups` to compute aggregate features. + +Given these groups, this section will discuss how you can set up offline batch jobs to produce the corresponding aggregate features, updated daily. To accomplish this, we need to set up a summingbird-scalding job that is pointed to the input data records containing features and labels to be aggregated. + +Input Data +---------- + +In order to generate aggregate features, the relevant input features need to be available offline as a daily scalding source in `DataRecord` format (typically `DailySuffixFeatureSource `_, though `HourlySuffixFeatureSource` could also be usable but we have not tested this). + +.. admonition:: Note + + The input data source should contain the keys, features and labels you want to use in your `AggregateGroups`. + +Aggregation Config +------------------ + +Now that we have a daily data source with input features and labels, we need to set up the `AggregateGroup` config itself. This contains all aggregation groups that you would like to compute and we will go through the implementation step-by-step. + +.. admonition:: Example: Timelines Quality config + + `TimelinesAggregationConfig `_ imports the configured `AggregationGroups` from `TimelinesAggregationConfigDetails `_. The config is then referenced by the implementing summingbird-scalding job which we will set up below.
+ +OfflineAggregateSource +---------------------- + +Each `AggregateGroup` will need to define a (daily) source of input features. We use `OfflineAggregateSource` for this to tell the aggregation framework where the input data set is and the required timestamp feature that the framework uses to decay aggregate feature values: + +.. code-block:: scala + + val timelinesDailyRecapSource = OfflineAggregateSource( + name = "timelines_daily_recap", + timestampFeature = TIMESTAMP, + scaldingHdfsPath = Some("/user/timelines/processed/suggests/recap/data_records"), + scaldingSuffixType = Some("daily"), + withValidation = true + ) + +.. admonition:: Note + + .. cssclass:: shortlist + + #. The name is not important as long as it is unique. + + #. `timestampFeature` must be a discrete feature of type `com.twitter.ml.api.Feature[Long]` and represents the “time” of a given training record in milliseconds - for example, the time at which an engagement, push open event, or abuse event took place that you are trying to train on. If you do not already have such a feature in your daily training data, you need to add one. + + #. `scaldingSuffixType` can be “hourly” or “daily” depending on the type of source (`HourlySuffixFeatureSource` vs `DailySuffixFeatureSource`). + + #. Set `withValidation` to true to validate the presence of _SUCCESS file. Context: https://jira.twitter.biz/browse/TQ-10618 + +Output HDFS store +----------------- + +The output HDFS store is where the computed aggregate features are stored. This store contains all computed aggregate feature values and is incrementally updated by the aggregates job every day. + +.. 
code-block:: scala + + val outputHdfsPath = "/user/timelines/processed/aggregates_v2" + val timelinesOfflineAggregateSink = new OfflineStoreCommonConfig { + override def apply(startDate: String) = new OfflineAggregateStoreCommonConfig( + outputHdfsPathPrefix = outputHdfsPath, + dummyAppId = "timelines_aggregates_v2_ro", // unused - can be arbitrary + dummyDatasetPrefix = "timelines_aggregates_v2_ro", // unused - can be arbitrary + startDate = startDate + ) + } + +Note: `dummyAppId` and `dummyDatasetPrefix` are unused so can be set to any arbitrary value. They should be removed on the framework side. + +The `outputHdfsPathPrefix` is the only field that matters, and should be set to the HDFS path where you want to store the aggregate features. Make sure you have a lot of quota available at that path. + +Setting Up Aggregates Job +------------------------- + +Once you have defined a config file with the aggregates you would like to compute, the next step is to create the aggregates scalding job using the config (`example `_). This is very concise and requires only a few lines of code: + +.. code-block:: scala + + object TimelinesAggregationScaldingJob extends AggregatesV2ScaldingJob { + override val aggregatesToCompute = TimelinesAggregationConfig.aggregatesToCompute + } + +Now that the scalding job is implemented with the aggregation config, we need to setup a capesos config similar to https://cgit.twitter.biz/source/tree/science/scalding/mesos/timelines/prod.yml: + +.. code-block:: scala + + # Common configuration shared by all aggregates v2 jobs + __aggregates_v2_common__: &__aggregates_v2_common__ + class: HadoopSummingbirdProducer + bundle: offline_aggregation-deploy.tar.gz + mainjar: offline_aggregation-deploy.jar + pants_target: "bundle timelines/data_processing/ad_hoc/aggregate_interactions/v2/offline_aggregation:bin" + cron_collision_policy: CANCEL_NEW + use_libjar_wild_card: true + +.. 
code-block:: scala + + # Specific job computing user aggregates + user_aggregates_v2: + <<: *__aggregates_v2_common__ + cron_schedule: "25 * * * *" + arguments: --batches 1 --output_stores user_aggregates --job_name timelines_user_aggregates_v2 + +.. admonition:: Important + + Each AggregateGroup in your config should have its own associated offline job which specifies `output_stores` pointing to the output store name you defined in your config. + +Running The Job +--------------- + +When you run the batch job for the first time, you need to add a temporary entry to your capesos yml file that looks like this: + +.. code-block:: scala + + user_aggregates_v2_initial_run: + <<: *__aggregates_v2_common__ + cron_schedule: "25 * * * *" + arguments: --batches 1 --start-time "2017-03-03 00:00:00" --output_stores user_aggregates --job_name timelines_user_aggregates_v2 + +.. admonition:: Start Time + + The additional `--start-time` argument should match the `startDate` in your config for that AggregateGroup, but in the format `yyyy-mm-dd hh:mm:ss`. + +To invoke the initial run via capesos, we would do the following (in the Timelines case): + +.. code-block:: scala + + CAPESOSPY_ENV=prod capesospy-v2 update --build_locally --start_cron user_aggregates_v2_initial_run science/scalding/mesos/timelines/prod.yml + +Once it is running smoothly, you can deschedule the initial run job and delete the temporary entry from your production yml config. + +.. code-block:: scala + + aurora cron deschedule atla/timelines/prod/user_aggregates_v2_initial_run + +Note: deschedule it preemptively to avoid repeatedly overwriting the same initial results. + +Then schedule the production job from jenkins using something like this: + +.. code-block:: scala + + CAPESOSPY_ENV=prod capesospy-v2 update user_aggregates_v2 science/scalding/mesos/timelines/prod.yml + +All future runs (2nd onwards) will use the permanent entry in the capesos yml config that does not have the `start-time` specified. + +..
admonition:: Job name has to match + + It's important that the production run should share the same `--job_name` with the initial_run so that eagleeye/statebird knows how to keep track of it correctly. + +Output Aggregate Features +------------------------- + +This scalding job using the example config from the earlier section would output a VersionedKeyValSource to `/user/timelines/processed/aggregates_v2/user_aggregates` on HDFS. + +Note that `/user/timelines/processed/aggregates_v2` is the explicitly defined root path while `user_aggregates` is the output directory of the example `AggregateGroup` defined earlier. The latter can be different for different `AggregateGroups` defined in your config. + + +The VersionedKeyValSource is difficult to use directly in your jobs/offline trainings, but we provide an adapted source `AggregatesV2FeatureSource` that makes it easy to join and use in your jobs: + +.. code-block:: scala + + import com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion._ + + val pipe: DataSetPipe = AggregatesV2FeatureSource( + rootPath = "/user/timelines/processed/aggregates_v2", + storeName = "user_aggregates", + aggregates = TimelinesAggregationConfig.aggregatesToCompute, + trimThreshold = 0 + )(dateRange).read + +Simply replace the `rootPath`, `storeName` and `aggregates` object with whatever you defined. The `trimThreshold` tells the framework to trim all features below a certain cutoff: 0 is a safe default to use to begin with. + +.. admonition:: Usage + + This can now be used like any other `DataSetPipe` in offline ML jobs. You can write out the features to a `DailySuffixFeatureSource`, you can join them with your data offline for trainings, or you can write them to a Manhattan store for serving online. + +Aggregate Features Example +-------------------------- + +Here is a sample of the aggregate features we just computed: + +..
code-block:: scala + + user_aggregate_v2.pair.any_label.any_feature.50.days.count: 100.0 + user_aggregate_v2.pair.any_label.tweetsource.is_quote.50.days.count: 30.0 + user_aggregate_v2.pair.is_favorited.any_feature.50.days.count: 10.0 + user_aggregate_v2.pair.is_favorited.tweetsource.is_quote.50.days.count: 6.0 + meta.user_id: 123456789 + +Aggregate feature names match a `prefix.pair.label.feature.half_life.metric` schema and correspond to what was defined in the aggregation config for each of these fields. + +.. admonition:: Example + + In this example, the above features are capturing that userId 123456789L has: + + .. + A 50-day decayed count of 100 training records with any label or feature (“tweet impressions”) + + A 50-day decayed count of 30 records that are “quote tweets” (tweetsource.is_quote = true) + + A 50-day decayed count of 10 records that are favorites on any type of tweet (is_favorited = true) + + A 50-day decayed count of 6 records that are “favorites” on “quote tweets” (both of the above are true) + +By combining the above, a model might infer that for this specific user, quote tweets comprise 30% of all impressions, have a favorite rate of 6/30 = 20%, compared to a favorite rate of 10/100 = 10% on the total population of tweets. + +Therefore, being a quote tweet makes this specific user `123456789L` approximately twice as likely to favorite the tweet, which is useful for prediction and could result in the ML model giving higher scores to & ranking quote tweets higher in a personalized fashion for this user. + +Tests for Feature Names +-------------------------- +When you change or add AggregateGroup, feature names might change. And the Feature Store provides a testing mechanism to assert that the feature names change as you expect. See `tests for feature names `_. 
diff --git a/timelines/data_processing/ml_util/aggregation_framework/docs/conf.py b/timelines/data_processing/ml_util/aggregation_framework/docs/conf.py new file mode 100644 index 0000000000..03996dfd76 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/docs/conf.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- +# +# docbird documentation build configuration file +# Note that not all possible configuration values are present in this +# autogenerated file. +# + +from os.path import abspath, dirname, isfile, join + + +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.ifconfig", + "sphinx.ext.graphviz", + "twitter.docbird.ext.thriftlexer", + "twitter.docbird.ext.toctree_default_caption", + "sphinxcontrib.httpdomain", +] + + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# The suffix of source filenames. +source_suffix = ".rst" + +# The master toctree document. +master_doc = "index" + +# General information about the project. +project = u"""Aggregation Framework""" +description = u"""""" + +# The short X.Y version. +version = u"""1.0""" +# The full version, including alpha/beta/rc tags. 
+release = u"""1.0""" + +exclude_patterns = ["_build"] + +pygments_style = "sphinx" + +html_theme = "default" + +html_static_path = ["_static"] + +html_logo = u"""""" + +# Automagically add project logo, if it exists +# (checks on any build, not just init) +# Scan for some common defaults (png or svg format, +# called "logo" or project name, in docs folder) +if not html_logo: + location = dirname(abspath(__file__)) + for logo_file in ["logo.png", "logo.svg", ("%s.png" % project), ("%s.svg" % project)]: + html_logo = logo_file if isfile(join(location, logo_file)) else html_logo + +graphviz_output_format = "svg" diff --git a/timelines/data_processing/ml_util/aggregation_framework/docs/index.rst b/timelines/data_processing/ml_util/aggregation_framework/docs/index.rst new file mode 100644 index 0000000000..af703c6889 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/docs/index.rst @@ -0,0 +1,11 @@ +.. markdowninclude:: ../README.md + +.. toctree:: + :maxdepth: 2 + :hidden: + + aggregation + batch + real-time + joining + troubleshooting diff --git a/timelines/data_processing/ml_util/aggregation_framework/docs/joining.rst b/timelines/data_processing/ml_util/aggregation_framework/docs/joining.rst new file mode 100644 index 0000000000..2ecdf7612f --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/docs/joining.rst @@ -0,0 +1,72 @@ +.. _joining: + +Joining aggregates features to records +====================================== + +After setting up either offline batch jobs or online real-time summingbird jobs to produce +aggregate features and querying them, we are left with data records containing aggregate features. +This page will go over how to join them with other data records to produce offline training data. 
+ +(To discuss: joining aggregates to records online) + +Joining Aggregates on Discrete/String Keys +------------------------------------------ + +Joining aggregate features keyed on discrete or text features to your training data is very easy - +you can use the built-in methods provided by `DataSetPipe`. For example, suppose you have aggregates +keyed by `(USER_ID, AUTHOR_ID)`: + +.. code-block:: scala + + val userAuthorAggregates: DataSetPipe = AggregatesV2FeatureSource( + rootPath = "/path/to/my/aggregates", + storeName = "user_author_aggregates", + aggregates = MyConfig.aggregatesToCompute, + trimThreshold = 0 + )(dateRange).read + +Offline, you can then join with your training data set as follows: + +.. code-block:: scala + + val myTrainingData: DataSetPipe = ... + val joinedData = myTrainingData.joinWithLarger((USER_ID, AUTHOR_ID), userAuthorAggregates) + +You can read from `AggregatesV2MostRecentFeatureSourceBeforeDate` in order to read the most recent aggregates +before a provided date `beforeDate`. Just note that `beforeDate` must be aligned with the date boundary (so if +you’re passing in a `dateRange`, use `dateRange.end`). + +Joining Aggregates on Sparse Binary Keys +---------------------------------------- + +When joining on sparse binary keys, there can be multiple aggregate records to join to each training record in +your training data set. For example, suppose you have set up an aggregate group that is keyed on `(INTEREST_ID, AUTHOR_ID)` +capturing engagement counts of users interested in a particular `INTEREST_ID` for specific authors provided by `AUTHOR_ID`. + +Suppose now that you have a training data record representing a specific user action. This training data record contains +a sparse binary feature `INTEREST_IDS` representing all the "interests" of that user - e.g. music, sports, and so on. Each `interest_id` +translates to a different set of counting features found in your aggregates data.
Therefore we need a way to merge all of +these different sets of counting features to produce a more compact, fixed-size set of features. + +.. admonition:: Merge policies + + To do this, the aggregate framework provides a trait `SparseBinaryMergePolicy `_. Classes overriding this trait define policies + that state how to merge the individual aggregate features from each sparse binary value (in this case, each `INTEREST_ID` for a user). + Furthermore, we provide `SparseBinaryMultipleAggregateJoin` which executes these policies to merge aggregates. + +A simple policy might simply average all the counts from the individual interests, or just take the max, or +a specific quantile. More advanced policies might use custom criteria to decide which interest is most relevant and choose +features from that interest to represent the user, or use some weighted combination of counts. + +The framework provides two simple in-built policies (`PickTopCtrPolicy `_ +and `CombineCountsPolicy `_, which keeps the topK counts per +record) that you can get started with, though you likely want to implement your own policy based on domain knowledge to get +the best results for your specific problem domain. + +.. admonition:: Offline Code Example + + The scalding job `TrainingDataWithAggV2Generator `_ shows how multiple merge policies are defined and implemented to merge aggregates on sparse binary keys to the TQ's training data records. + +.. admonition:: Online Code Example + + In our (non-FeatureStore enabled) online code path, we merge aggregates on sparse binary keys using the `CombineCountsPolicy `_. diff --git a/timelines/data_processing/ml_util/aggregation_framework/docs/real-time.rst b/timelines/data_processing/ml_util/aggregation_framework/docs/real-time.rst new file mode 100644 index 0000000000..fc853ba699 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/docs/real-time.rst @@ -0,0 +1,327 @@ +.. 
_real_time: + +Real-Time aggregate features +============================ + +In addition to computing batch aggregate features, the aggregation framework supports real-time aggregates as well. The framework concepts used here are identical to the batch use case, however, the underlying implementation differs and is provided by summingbird-storm jobs. + +RTA Runbook +----------- + +For operational details, please visit http://go/tqrealtimeaggregates. + +Prerequisites +------------- + +In order to start computing real-time aggregate features, the framework requires the following to be provided: + +* A backing memcached store that will hold the computed aggregate features. This is conceptually equivalent to the output HDFS store in the batch compute case. +* Implementation of `StormAggregateSource `_ that creates `DataRecords` with the necessary input features. This serves as the input to the aggregation operations. +* Definition of aggregate features by defining `AggregateGroup` in an implementation of `OnlineAggregationConfigTrait`. This is identical to the batch case. +* Job config file defining the backing memcached for feature storage and retrieval, and job-related parameters. + +We will now go through the details in setting up each required component. + +Memcached store +--------------- + +Real-time aggregates use Memcache as the backing cache to store and update aggregate features keys. Caches can be provisioned on `go/cacheboard `_. + +.. admonition:: Test and prod caches + + For development, it is sufficient to setup a test cache that your new job can query and write to. At the same time, a production cache request should also be submitted as these generally have significant lead times for provisioning. + +StormAggregateSource +-------------------- + +To enable aggregation of your features, we need to start with defining a `StormAggregateSource` that builds a `Producer[Storm, DataRecord]`. 
This summingbird producer generates `DataRecords` that contain the input features and labels that the real-time aggregate job will compute aggregate features on. Conceptually, this is equivalent to the input data set in the offline batch use case. + +.. admonition:: Example + + If you are planning to aggregate on client engagements, you would need to subscribe to the `ClientEvent` kafka stream and then convert each event to a `DataRecord` that contains the key and the engagement on which to aggregate. + +Typically, we would set up a julep filter for the relevant client events that we would like to aggregate on. This gives us a `Producer[Storm, LogEvent]` object which we then convert to `Producer[Storm, DataRecord]` with adapters that we wrote: + +.. code-block:: scala + + lazy val clientEventProducer: Producer[Storm, LogEvent] = + ClientEventSourceScrooge( + appId = AppId(jobConfig.appId), + topic = "julep_client_event_suggests", + resumeAtLastReadOffset = false + ).source.name("timelines_events") + + lazy val clientEventWithCachedFeaturesProducer: Producer[Storm, DataRecord] = clientEventProducer + .flatMap(mkDataRecords) + +Note that this way of composing the storm graph gives us flexibility in how we can hydrate input features. If you would like to join more complex features to `DataRecord`, you can do so here with additional storm components which can implement cache queries. + +.. admonition:: Timelines Quality use case + + In Timelines Quality, we aggregate client engagements on `userId` or `tweetId` and implement + `TimelinesStormAggregateSource `_. We create + `Producer[Storm,LogEvent]` of Timelines engagements to which we apply `ClientLogEventAdapter `_ which converts the event to `DataRecord` containing `userId`, `tweetId`, `timestampFeature` of the engagement and the engagement label itself. + +.. admonition:: MagicRecs use case + + MagicRecs has a very similar setup for real-time aggregate features.
In addition, they also implement a more complex cache query to fetch the user's history in the `StormAggregateSource` for each observed client engagement to hydrate a richer set of input `DataRecords`: + + .. code-block:: scala + + val userHistoryStoreService: Storm#Service[Long, History] = + Storm.service(UserHistoryReadableStore) + + val clientEventDataRecordProducer: Producer[Storm, DataRecord] = + magicRecsClientEventProducer + .flatMap { ... + (userId, logEvent) + }.leftJoin(userHistoryStoreService) + .flatMap { + case (_, (logEvent, history)) => + mkDataRecords(LogEventHistoryPair(logEvent, history)) + } + +.. admonition:: EmailRecs use case + + EmailRecs shares the same cache as MagicRecs. They combine notification scribe data with email history data to identify the particular item a user engaged with in an email: + + .. code-block:: scala + + val emailHistoryStoreService: Storm#Service[Long, History] = + Storm.service(EmailHistoryReadableStore) + + val emailEventDataRecordProducer: Producer[Storm, DataRecord] = + emailEventProducer + .flatMap { ... + (userId, logEvent) + }.leftJoin(emailHistoryStoreService) + .flatMap { + case (_, (scribe, history)) => + mkDataRecords(ScribeHistoryPair(scribe, history)) + } + + +Aggregation config +------------------ + +The real-time aggregation config is extended from `OnlineAggregationConfigTrait `_ and defines the features to aggregate and the backing memcached store to which they will be written. + +Setting up real-time aggregates follows the same rules as in the offline batch use case. The major difference here is that `inputSource` should point to the `StormAggregateSource` implementation that provides the `DataRecord` containing the engagements and core features on which to aggregate. In the offline case, this would have been an `OfflineAggregateSource` pointing to an offline source of daily records. 
+ +Finally, `RealTimeAggregateStore` defines the backing memcache to be used and should be provided here as the `outputStore`. + +.. NOTE:: + + Please make sure to provide an `AggregateGroup` for both staging and production. The main difference should be the `outputStore` where features in either environment are read from and written to. You want to make sure that a staged real-time aggregates summingbird job is reading/writing only to the test memcache store and does not mutate the production store. + +Job config +---------- + +In addition to the aggregation config that defines the features to aggregate, the final piece we need to provide is a `RealTimeAggregatesJobConfig` that specifies job values such as `appId`, `teamName` and counts for the various topology components that define the capacity of the job (`Timelines example `_). + +Once you have the job config, implementing the storm job itself is easy and almost as concise as in the batch use case: + +.. code-block:: scala + + object TimelinesRealTimeAggregatesJob extends RealTimeAggregatesJobBase { + override lazy val statsReceiver = DefaultStatsReceiver.scope("timelines_real_time_aggregates") + override lazy val jobConfigs = TimelinesRealTimeAggregatesJobConfigs + override lazy val aggregatesToCompute = TimelinesOnlineAggregationConfig.AggregatesToCompute + } + +.. NOTE:: + There are some topology settings that are currently hard-coded. In particular, we enable `Config.TOPOLOGY_DROPTUPLES_UPON_BACKPRESSURE` to be true for added robustness. This may be made user-definable in the future. + +Steps to hydrate RTAs +--------------------- +1. Make the changes to RTAs and follow the steps for `Running the topology`. +2. Register the new RTAs to feature store. Sample phab: https://phabricator.twitter.biz/D718120 +3. Wire the features from feature store to TLX. This is usually done with the feature switch set to False. So it's just a code change and will not start hydrating the features yet. Merge the phab.
Sample phab: https://phabricator.twitter.biz/D718424 +4. Now we hydrate the features to TLX gradually by doing it shard wise. For this, first create a PCM and then enable the hydration. Sample PCM: https://jira.twitter.biz/browse/PCM-147814 + +Running the topology +-------------------- +0. For phab that makes change to the topology (such as adding new ML features), before landing the phab, please create a PCM (`example `_) and deploy the change to devel topology first and then prod (atla and pdxa). Once it is confirmed that the prod topology can handle the change, the phab can be landed. +1. Go to https://ci.twitter.biz/job/tq-ci/build +2. In `commands` input + +.. code-block:: bash + + . src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/deploy_local.sh [devel|atla|pdxa] + +One can only deploy either `devel`, `atla` (prod atla), `pdxa` (prod pdxa) at a time. +For example, to deploy both pdxa and atla prod topologies, one needs to build/run the above steps twice, one with `pdxa` and the other with `atla`. + +The status and performance stats of the topology are found at `go/heron-ui `_. Here you can view whether the job is processing tuples, whether it is under any memory or backpressure and provides general observability. + +Finally, since we enable `Config.TOPOLOGY_DROPTUPLES_UPON_BACKPRESSURE` by default in the topology, we also need to monitor and alert on the number of dropped tuples. Since this is a job generating features a small fraction of dropped tuples is tolerable if that enables us to avoid backpressure that would hold up global computation in the entire graph. + +Hydrating Real-Time Aggregate Features +-------------------------------------- + +Once the job is up and running, the aggregate features will be accessible in the backing memcached store. To access these features and hydrate to your online pipeline, we need to build a Memcache client with the right query key. + +.. 
admonition:: Example + + Some care needs to be taken to define the key injection and codec correctly for the memcached store. These types do not change and you can use the Timelines `memcache client builder `_ as an example. + +Aggregate features are written to the store with a `(AggregationKey, BatchID)` key. + +`AggregationKey `_ is an instance of the keys that you previously defined in `AggregateGroup`. If your aggregation key is `USER_ID`, you would need to instantiate `AggregationKey` with the `USER_ID` featureId and the userId value. + +.. admonition:: Returned features + + The `DataRecord` that is returned by the cache now contains all real-time aggregate features for the query `AggregationKey` (similar to the batch use case). If your online hydration flow produces data records, the real-time aggregate features can be joined with your existing records in a straightforward way. + +Adding features from Feature Store to RTA +-------------------------------------------- +To add features from Feature Store to RTA and create real time aggregated features based on them, one needs to follow these steps: + +**Step 1** + +Copy Strato column for features that one wants to explore and add a cache if needed. See details at `Customize any Columns for your Team as Needed `_. As an `example `_, we copy Strato column of recommendationsUserFeaturesProd.User.strato and add a cache for timelines team's usage. + +**Step 2** + +Create a new ReadableStore which uses Feature Store Client to request features from Feature Store. Implement FeaturesAdapter which extends TimelinesAdapterBase and derive new features based on raw features from Feature Store. As an `example `_, we create UserFeaturesReadableStore which reads discrete feature user state, and convert it to a list of boolean user state features. + +**Step 3** + +Join these derived features from Feature Store to timelines storm aggregate source.
Depending on the characteristics of these derived features, the join key could be tweet id, user id or others. As an `example `_, because user state is per user, the join key is user id. + +**Step 4** + +Define `AggregateGroup` based on derived features in RTA + +Adding New Aggregate Features from an Existing Dataset +------------------------------------------------------ +To add a new aggregate feature group from an existing dataset for use in home models, use the following steps: + +1. Identify the hypothesis being tested by the addition of the features, in accordance with `go/tpfeatureguide `_. +2. Modify or add a new AggregateGroup to `TimelinesOnlineAggregationConfigBase.scala `_ to define the aggregation key, set of features, labels and metrics. An example phab to add more halflives can be found at `D204415 `_. +3. If the change is expected to be very large, it may be recommended to perform capacity estimation. See `Capacity Estimation`_ for more details. +4. Create feature catalog items for the new RTAs. An example phab is `D706348 `_. For approval from a featurestore owner ping #help-ml-features on slack. +5. Add new features to the featurestore. An example phab is `D706112 `_. This change can be rolled out with feature switches or by canarying TLX, depending on the risk. An example PCM for feature switches is: `PCM-148654 `_. An example PCM for canarying is: `PCM-145753 `_. +6. Wait for redeploy and confirm the new features are available. One way is querying in BigQuery from a table like `twitter-bq-timelines-prod.continuous_training_recap_fav`. Another way is to inspect individual records using pcat. The command to be used is like: + +..
code-block:: bash + + java -cp pcat-deploy.jar:$(hadoop classpath) com.twitter.ml.tool.pcat.PredictionCatTool + -path /atla/proc2/user/timelines/processed/suggests/recap/continuous_training_data_records/fav/data/YYYY/MM/DD/01/part-00000.lzo + -fc /atla/proc2/user/timelines/processed/suggests/recap/continuous_training_data_records/fav/data_spec.json + -dates YYYY-MM-DDT01 -record_limit 100 | grep [feature_group] + + +7. Create a phab with the new features and test the performance of a model with them compared to a control model without them. Test offline using `Deepbird for training `_ and `RCE Hypothesis Testing `_. Test online using a DDG. Some helpful instructions are available in `Serving Timelines Models `_ and the `Experiment Cookbook `_. + +Capacity Estimation +-------------------------------- +This section describes how to approximate the capacity required for a new aggregate group. It is not expected to be exact, but should give a rough estimate. + +There are two main components that must be stored for each aggregate group. + +Key space: Each AggregationKey struct consists of two maps, one of which is populated with tuples [Long, Long] representing discrete features. This takes up 4 x 8 bytes or 32 bytes. The cache team estimates an additional 40 bytes of overhead. + +Features: An aggregate feature is represented as a pair (16 bytes) and is produced for each feature x label x metric x halflife combination. + +1. Use bigquery to estimate how many unique values exist for the selected key (key_count). Also collect the number of features, labels, metrics, and half-lives being used. +2. Compute the number of entries to be created, which is num_entries = feature_count * label_count * metric_count * halflife_count +3. Compute the number of bytes per entry, which is num_entry_bytes = 16*num_entries + 32 bytes (key storage) + 40 bytes (overhead) +4.
Compute total space required = num_entry_bytes * key_count + +Debugging New Aggregate Features +-------------------------------- + +To debug problems in the setup of your job, there are several steps you can take. + +First, ensure that data is being received from the input stream and passed through to create data records. This can be achieved by logging results at various places in your code, and especially at the point of data record creation. + +For example, suppose you want to ensure that a data record is being created with +the features you expect. With push and email features, we find that data records +are created in the adaptor, using logic like the following: + +.. code-block:: scala + + val record = new SRichDataRecord(new DataRecord) + ... + record.setFeatureValue(feature, value) + +To see what these feature values look like, we can have our adaptor class extend +Twitter's `Logging` trait, and write each created record to a log file. + +.. code-block:: scala + + class MyEventAdaptor extends TimelinesAdapterBase[MyObject] with Logging { + ... + ... + def mkDataRecord(myFeatures: MyFeatures): DataRecord = { + val record = new SRichDataRecord(new DataRecord) + ... + record.setFeatureValue(feature, value) + logger.info("data record xyz: " + record.getRecord.toString) + } + +This way, every time a data record is sent to the aggregator, it will also be +logged. To inspect these logs, you can push these changes to a staging instance, +ssh into that aurora instance, and grep the `log-files` directory for `xyz`. The +data record objects you find should resemble a map from feature ids to their +values. + +To check that steps in the aggregation are being performed, you can also inspect the job's topology on go/heronui. + +Lastly, to verify that values are being written to your cache you can check the `set` chart in your cache's viz. + +To check particular feature values for a given key, you can spin up a Scala REPL like so: + +.. 
code-block:: bash + + $ ssh -fN -L*:2181:sdzookeeper-read.atla.twitter.com:2181 -D *:50001 nest.atlc.twitter.com + + $ ./pants repl --jvm-repl-scala-options='-DsocksProxyHost=localhost -DsocksProxyPort=50001 -Dcom.twitter.server.resolverZkHosts=localhost:2181' timelinemixer/common/src/main/scala/com/twitter/timelinemixer/clients/real_time_aggregates_cache + +You will then need to create a connection to the cache, and a key with which to query it. + +.. code-block:: scala + + import com.twitter.conversions.DurationOps._ + import com.twitter.finagle.stats.{DefaultStatsReceiver, StatsReceiver} + import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey + import com.twitter.summingbird.batch.Batcher + import com.twitter.timelinemixer.clients.real_time_aggregates_cache.RealTimeAggregatesMemcacheBuilder + import com.twitter.timelines.clients.memcache_common.StorehausMemcacheConfig + + val userFeature = -1887718638306251279L // feature id corresponding to User feature + val userId = 12L // replace with a user id logged when creating your data record + val key = (AggregationKey(Map(userFeature -> userId), Map.empty), Batcher.unit.currentBatch) + + val dataset = "twemcache_magicrecs_real_time_aggregates_cache_staging" // replace with the appropriate cache name + val dest = s"/srv#/test/local/cache/twemcache_/$dataset" + + val statsReceiver: StatsReceiver = DefaultStatsReceiver + val cache = new RealTimeAggregatesMemcacheBuilder( + config = StorehausMemcacheConfig( + destName = dest, + keyPrefix = "", + requestTimeout = 10.seconds, + numTries = 1, + globalTimeout = 10.seconds, + tcpConnectTimeout = 10.seconds, + connectionAcquisitionTimeout = 10.seconds, + numPendingRequests = 250, + isReadOnly = true + ), + statsReceiver.scope(dataset) + ).build + + val result = cache.get(key) + +Another option is to create a debugger which points to the staging cache and creates a cache connection and key similar to the logic above. 
+ +Run CQL query to find metrics/counters +-------------------------------------- +We can also visualize the counters from our job to verify new features. Run a CQL query in a terminal to find the right path of metrics/counters. For example, in order to check counter mergeNumFeatures, run: + +cql -z atla keys heron/summingbird_timelines_real_time_aggregates Tail-FlatMap | grep mergeNumFeatures + + +Then use the right path to create the viz, example: https://monitoring.twitter.biz/tiny/2552105 diff --git a/timelines/data_processing/ml_util/aggregation_framework/docs/troubleshooting.rst b/timelines/data_processing/ml_util/aggregation_framework/docs/troubleshooting.rst new file mode 100644 index 0000000000..d9799f4337 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/docs/troubleshooting.rst @@ -0,0 +1,117 @@ +.. _troubleshooting: + +TroubleShooting +================== + + +[Batch] Regenerating a corrupt version +-------------------------------------- + +Symptom +~~~~~~~~~~ +The Summingbird batch job failed due to the following error: + +.. code:: bash + + Caused by: com.twitter.bijection.InversionFailure: ... + +This typically indicates corrupt records in the aggregate store (as opposed to the DataRecord source). +The following describes the method to re-generate the required (typically the latest) version: + +Solution +~~~~~~~~~~ +1. Copy **the second to last version** of the problematic data to the canaries folder. For example, if 11/20's job keeps failing, then copy 11/19's data. + +.. code:: bash + + $ hadoop --config /etc/hadoop/hadoop-conf-proc2-atla/ \ + distcp -m 1000 \ + /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates/1605744000000 \ + /atla/proc2/user/timelines/canaries/processed/aggregates_v2/user_mention_aggregates/1605744000000 + + +2. Set up a canary run for the date of the problem with fallback path pointing to `1605744000000` in the prod/canaries folder. + +3. 
Deschedule the production job and kill the current run: + +For example, + +.. code:: bash + + $ aurora cron deschedule atla/timelines/prod/user_mention_aggregates + $ aurora job killall atla/timelines/prod/user_mention_aggregates + +4. Create backup folder and move the corrupt prod store output there + +.. code:: bash + + $ hdfs dfs -mkdir /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates_backup + $ hdfs dfs -mv /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates/1605830400000 /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates_backup/ + $ hadoop fs -count /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates_backup/1605830400000 + + 1 1001 10829136677614 /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates_backup/1605830400000 + + +5. Copy canary output store to prod folder: + +.. code:: bash + + $ hadoop --config /etc/hadoop/hadoop-conf-proc2-atla/ distcp -m 1000 /atla/proc2/user/timelines/canaries/processed/aggregates_v2/user_mention_aggregates/1605830400000 /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates/1605830400000 + +We can see the slight difference of size: + +.. code:: bash + + $ hadoop fs -count /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates_backup/1605830400000 + 1 1001 10829136677614 /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates_backup/1605830400000 + $ hadoop fs -count /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates/1605830400000 + 1 1001 10829136677844 /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates/1605830400000 + +6. Deploy prod job again and observe whether it can successfully process the new output for the date of interest. + +7. Verify the new run succeeded and job is unblocked. 
+ +Example +~~~~~~~~ + +There is an example in https://phabricator.twitter.biz/D591174 + + +[Batch] Skipping the offline job ahead +--------------------------------------- + +Symptom +~~~~~~~~~~ +The Summingbird batch job keeps failing and the DataRecord source is no longer available (e.g. due to retention) and there is no way for the job to succeed **OR** + +.. +The job is stuck processing old data (more than one week old) and it will not catch up to the new data on its own if it is left alone + +Solution +~~~~~~~~ + +We will need to skip the job ahead. Unfortunately, this involves manual effort. We also need help from the ADP team (Slack #adp). + +1. Ask the ADP team to manually insert an entry into the store via the #adp Slack channel. You may refer to https://jira.twitter.biz/browse/AIPIPE-7520 and https://jira.twitter.biz/browse/AIPIPE-9300 as references. However, please don't create and assign tickets directly to an ADP team member unless they ask you to. + +2. Copy the latest version of the store to the same HDFS directory but with a different destination name. The name MUST be the same as the version inserted above. + +For example, if the ADP team manually inserted a version on 12/09/2020, then we can see the version by running + +.. code:: bash + + $ dalv2 segment list --name user_original_author_aggregates --role timelines --location-name proc2-atla --location-type hadoop-cluster + ... + None 2020-12-09T00:00:00Z viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_original_author_aggregates/1607472000000 Unknown None + +where `1607472000000` is the timestamp of 12/09/2020. +Then you will need to duplicate the latest version of the store to a dir of `1607472000000`. +For example, + +.. 
code:: bash + + $ hadoop --config /etc/hadoop/hadoop-conf-proc2-atla/ distcp -m 1000 /atla/proc2/user/timelines/processed/aggregates_v2/user_original_author_aggregates/1605052800000 /atla/proc2/user/timelines/processed/aggregates_v2/user_original_author_aggregates/1607472000000 + +3. Go to the EagleEye UI of the job and click on the "Skip Ahead" button to the desired datetime. In our example, it should be `2020-12-09 12am` + +4. Wait for the job to start. Now the job should be running the 2020-12-09 partition. diff --git a/timelines/data_processing/ml_util/aggregation_framework/heron/BUILD b/timelines/data_processing/ml_util/aggregation_framework/heron/BUILD new file mode 100644 index 0000000000..0cc576e4e6 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/heron/BUILD @@ -0,0 +1,74 @@ +scala_library( + sources = ["*.scala"], + platform = "java8", + strict_deps = False, + tags = ["bazel-compatible"], + dependencies = [ + ":configs", + "3rdparty/jvm/storm:heron-oss-storm", + "3rdparty/src/jvm/com/twitter/scalding:args", + "3rdparty/src/jvm/com/twitter/summingbird:storm", + "src/java/com/twitter/heron/util", + "src/java/com/twitter/ml", + "src/scala/com/twitter/storehaus_internal/nighthawk_kv", + "src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits", + "src/scala/com/twitter/summingbird_internal/runner/common", + "src/scala/com/twitter/summingbird_internal/runner/storm", + "src/scala/com/twitter/timelines/prediction/features/common", + "timelines/data_processing/ml_util/aggregation_framework:user_job", + ], +) + +scala_library( + name = "configs", + sources = [ + "NighthawkUnderlyingStoreConfig.scala", + "OnlineAggregationConfigTrait.scala", + "OnlineAggregationStoresTrait.scala", + "RealTimeAggregateStore.scala", + "RealTimeAggregatesJobConfig.scala", + "StormAggregateSource.scala", + ], + platform = "java8", + strict_deps = True, + tags = ["bazel-compatible"], + dependencies = [ + ":base-config", + 
"3rdparty/jvm/storm:heron-oss-storm", + "3rdparty/src/jvm/com/twitter/summingbird:core", + "3rdparty/src/jvm/com/twitter/summingbird:storm", + "finagle/finagle-core/src/main", + "src/java/com/twitter/ml/api:api-base", + "src/scala/com/twitter/storehaus_internal/memcache", + "src/scala/com/twitter/storehaus_internal/memcache/config", + "src/scala/com/twitter/storehaus_internal/nighthawk_kv", + "src/scala/com/twitter/storehaus_internal/nighthawk_kv/config", + "src/scala/com/twitter/storehaus_internal/online", + "src/scala/com/twitter/storehaus_internal/store", + "src/scala/com/twitter/storehaus_internal/util", + "src/scala/com/twitter/summingbird_internal/runner/store_config", + "src/thrift/com/twitter/clientapp/gen:clientapp-java", + "src/thrift/com/twitter/ml/api:data-java", + "src/thrift/com/twitter/ml/api:data-scala", + "src/thrift/com/twitter/ml/api:feature_context-java", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + "timelines/data_processing/ml_util/transforms", + "util/util-core:scala", + "util/util-core:util-core-util", + "util/util-stats/src/main/scala/com/twitter/finagle/stats", + ], +) + +scala_library( + name = "base-config", + sources = [ + "OnlineAggregationConfigTrait.scala", + ], + platform = "java8", + strict_deps = True, + tags = ["bazel-compatible"], + dependencies = [ + "src/java/com/twitter/ml/api:api-base", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) diff --git a/timelines/data_processing/ml_util/aggregation_framework/heron/NighthawkUnderlyingStoreConfig.scala b/timelines/data_processing/ml_util/aggregation_framework/heron/NighthawkUnderlyingStoreConfig.scala new file mode 100644 index 0000000000..cf7668a207 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/heron/NighthawkUnderlyingStoreConfig.scala @@ -0,0 +1,31 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron + +import com.twitter.conversions.DurationOps._ +import 
com.twitter.finagle.mtls.authentication.EmptyServiceIdentifier +import com.twitter.finagle.mtls.authentication.ServiceIdentifier +import com.twitter.finagle.ssl.OpportunisticTls +import com.twitter.storehaus_internal.nighthawk_kv.CacheClientNighthawkConfig +import com.twitter.storehaus_internal.util.TTL +import com.twitter.storehaus_internal.util.TableName +import com.twitter.summingbird_internal.runner.store_config.OnlineStoreOnlyConfig +import com.twitter.util.Duration + +case class NighthawkUnderlyingStoreConfig( + serversetPath: String = "", + tableName: String = "", + cacheTTL: Duration = 1.day) + extends OnlineStoreOnlyConfig[CacheClientNighthawkConfig] { + + def online: CacheClientNighthawkConfig = online(EmptyServiceIdentifier) + + def online( + serviceIdentifier: ServiceIdentifier = EmptyServiceIdentifier + ): CacheClientNighthawkConfig = + CacheClientNighthawkConfig( + serversetPath, + TableName(tableName), + TTL(cacheTTL), + serviceIdentifier = serviceIdentifier, + opportunisticTlsLevel = OpportunisticTls.Required + ) +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/heron/OnlineAggregationConfigTrait.scala b/timelines/data_processing/ml_util/aggregation_framework/heron/OnlineAggregationConfigTrait.scala new file mode 100644 index 0000000000..aea6491286 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/heron/OnlineAggregationConfigTrait.scala @@ -0,0 +1,28 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron + +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup +import com.twitter.ml.api.Feature + +trait OnlineAggregationConfigTrait { + def ProdAggregates: Set[TypedAggregateGroup[_]] + def StagingAggregates: Set[TypedAggregateGroup[_]] + def ProdCommonAggregates: Set[TypedAggregateGroup[_]] + + /** + * AggregateToCompute: This defines the complete set of aggregates to be + * computed by the aggregation job and to be stored in 
memcache. + */ + def AggregatesToCompute: Set[TypedAggregateGroup[_]] + + /** + * ProdFeatures: This defines the subset of aggregates to be extracted + * and hydrated (or adapted) by callers to the aggregates features cache. + * This should only contain production aggregates and aggregates on + * product specific engagements. + * ProdCommonFeatures: Similar to ProdFeatures but containing user-level + * aggregate features. This is provided to PredictionService just + * once per user. + */ + lazy val ProdFeatures: Set[Feature[_]] = ProdAggregates.flatMap(_.allOutputFeatures) + lazy val ProdCommonFeatures: Set[Feature[_]] = ProdCommonAggregates.flatMap(_.allOutputFeatures) +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/heron/OnlineAggregationStoresTrait.scala b/timelines/data_processing/ml_util/aggregation_framework/heron/OnlineAggregationStoresTrait.scala new file mode 100644 index 0000000000..4f693190e6 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/heron/OnlineAggregationStoresTrait.scala @@ -0,0 +1,6 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron + +trait OnlineAggregationStoresTrait { + def ProductionStore: RealTimeAggregateStore + def StagingStore: RealTimeAggregateStore +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/heron/RealTimeAggregateStore.scala b/timelines/data_processing/ml_util/aggregation_framework/heron/RealTimeAggregateStore.scala new file mode 100644 index 0000000000..2e75039d34 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/heron/RealTimeAggregateStore.scala @@ -0,0 +1,50 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron + +import com.twitter.conversions.DurationOps._ +import com.twitter.finagle.mtls.authentication.EmptyServiceIdentifier +import com.twitter.finagle.mtls.authentication.ServiceIdentifier +import com.twitter.storehaus_internal.memcache.ConnectionConfig 
+import com.twitter.storehaus_internal.memcache.MemcacheConfig +import com.twitter.storehaus_internal.util.KeyPrefix +import com.twitter.storehaus_internal.util.TTL +import com.twitter.storehaus_internal.util.ZkEndPoint +import com.twitter.summingbird_internal.runner.store_config.OnlineStoreOnlyConfig +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateStore +import com.twitter.util.Duration + +object RealTimeAggregateStore { + val twCacheWilyPrefix = "/srv#" // s2s is only supported for wily path + + def makeEndpoint( + memcacheDataSet: String, + isProd: Boolean, + twCacheWilyPrefix: String = twCacheWilyPrefix + ): String = { + val env = if (isProd) "prod" else "test" + s"$twCacheWilyPrefix/$env/local/cache/$memcacheDataSet" + } +} + +case class RealTimeAggregateStore( + memcacheDataSet: String, + isProd: Boolean = false, + cacheTTL: Duration = 1.day) + extends OnlineStoreOnlyConfig[MemcacheConfig] + with AggregateStore { + import RealTimeAggregateStore._ + + override val name: String = "" + val storeKeyPrefix: KeyPrefix = KeyPrefix(name) + val memcacheZkEndPoint: String = makeEndpoint(memcacheDataSet, isProd) + + def online: MemcacheConfig = online(serviceIdentifier = EmptyServiceIdentifier) + + def online(serviceIdentifier: ServiceIdentifier = EmptyServiceIdentifier): MemcacheConfig = + new MemcacheConfig { + val endpoint = ZkEndPoint(memcacheZkEndPoint) + override val connectionConfig = + ConnectionConfig(endpoint, serviceIdentifier = serviceIdentifier) + override val keyPrefix = storeKeyPrefix + override val ttl = TTL(Duration.fromMilliseconds(cacheTTL.inMillis)) + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/heron/RealTimeAggregatesJobBase.scala b/timelines/data_processing/ml_util/aggregation_framework/heron/RealTimeAggregatesJobBase.scala new file mode 100644 index 0000000000..906f7c1be9 --- /dev/null +++ 
b/timelines/data_processing/ml_util/aggregation_framework/heron/RealTimeAggregatesJobBase.scala @@ -0,0 +1,301 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron + +import com.twitter.algebird.Monoid +import com.twitter.bijection.Injection +import com.twitter.bijection.thrift.CompactThriftCodec +import com.twitter.conversions.DurationOps._ +import com.twitter.finagle.mtls.authentication.EmptyServiceIdentifier +import com.twitter.finagle.mtls.authentication.ServiceIdentifier +import com.twitter.finagle.stats.StatsReceiver +import com.twitter.heron.util.CommonMetric +import com.twitter.ml.api.DataRecord +import com.twitter.scalding.Args +import com.twitter.storehaus.algebra.MergeableStore +import com.twitter.storehaus.algebra.StoreAlgebra._ +import com.twitter.storehaus_internal.memcache.Memcache +import com.twitter.storehaus_internal.store.CombinedStore +import com.twitter.storehaus_internal.store.ReplicatingWritableStore +import com.twitter.summingbird.batch.BatchID +import com.twitter.summingbird.batch.Batcher +import com.twitter.summingbird.online.MergeableStoreFactory +import com.twitter.summingbird.online.option._ +import com.twitter.summingbird.option.CacheSize +import com.twitter.summingbird.option.JobId +import com.twitter.summingbird.storm.option.FlatMapStormMetrics +import com.twitter.summingbird.storm.option.SummerStormMetrics +import com.twitter.summingbird.storm.Storm +import com.twitter.summingbird.storm.StormMetric +import com.twitter.summingbird.Options +import com.twitter.summingbird._ +import com.twitter.summingbird_internal.runner.common.CapTicket +import com.twitter.summingbird_internal.runner.common.JobName +import com.twitter.summingbird_internal.runner.common.TeamEmail +import com.twitter.summingbird_internal.runner.common.TeamName +import com.twitter.summingbird_internal.runner.storm.ProductionStormConfig +import com.twitter.timelines.data_processing.ml_util.aggregation_framework._ +import 
com.twitter.timelines.data_processing.ml_util.aggregation_framework.job.AggregatesV2Job +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.job.AggregatesV2Job +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.job.DataRecordFeatureCounter +import org.apache.heron.api.{Config => HeronConfig} +import org.apache.heron.common.basics.ByteAmount +import org.apache.storm.Config +import scala.collection.JavaConverters._ + +object RealTimeAggregatesJobBase { + lazy val commonMetric: StormMetric[CommonMetric] = + StormMetric(new CommonMetric(), CommonMetric.NAME, CommonMetric.POLL_INTERVAL) + lazy val flatMapMetrics: FlatMapStormMetrics = FlatMapStormMetrics(Iterable(commonMetric)) + lazy val summerMetrics: SummerStormMetrics = SummerStormMetrics(Iterable(commonMetric)) +} + +trait RealTimeAggregatesJobBase extends Serializable { + import RealTimeAggregatesJobBase._ + import com.twitter.summingbird_internal.bijection.BatchPairImplicits._ + + def statsReceiver: StatsReceiver + + def aggregatesToCompute: Set[TypedAggregateGroup[_]] + + def jobConfigs: RealTimeAggregatesJobConfigs + + implicit lazy val dataRecordCodec: Injection[DataRecord, Array[Byte]] = + CompactThriftCodec[DataRecord] + implicit lazy val monoid: Monoid[DataRecord] = DataRecordAggregationMonoid(aggregatesToCompute) + implicit lazy val aggregationKeyInjection: Injection[AggregationKey, Array[Byte]] = + AggregationKeyInjection + + val clusters: Set[String] = Set("atla", "pdxa") + + def buildAggregateStoreToStorm( + isProd: Boolean, + serviceIdentifier: ServiceIdentifier, + jobConfig: RealTimeAggregatesJobConfig + ): (AggregateStore => Option[Storm#Store[AggregationKey, DataRecord]]) = { + (store: AggregateStore) => + store match { + case rtaStore: RealTimeAggregateStore if rtaStore.isProd == isProd => { + lazy val primaryStore: MergeableStore[(AggregationKey, BatchID), DataRecord] = + Memcache.getMemcacheStore[(AggregationKey, BatchID), DataRecord]( + 
rtaStore.online(serviceIdentifier)) + + lazy val mergeableStore: MergeableStore[(AggregationKey, BatchID), DataRecord] = + if (jobConfig.enableUserReindexingNighthawkBtreeStore + || jobConfig.enableUserReindexingNighthawkHashStore) { + val reindexingNighthawkBtreeWritableDataRecordStoreList = + if (jobConfig.enableUserReindexingNighthawkBtreeStore) { + lazy val cacheClientNighthawkConfig = + jobConfig.userReindexingNighthawkBtreeStoreConfig.online(serviceIdentifier) + List( + UserReindexingNighthawkWritableDataRecordStore.getBtreeStore( + nighthawkCacheConfig = cacheClientNighthawkConfig, + // Choose a reasonably large target size as this will be equivalent to the number of unique (user, timestamp) + // keys that are returned on read on the pKey, and we may have duplicate authors and associated records. + targetSize = 512, + statsReceiver = statsReceiver, + // Assuming trims are relatively expensive, choose a trimRate that's not as aggressive. In this case we trim on + // 10% of all writes. + trimRate = 0.1 + )) + } else { Nil } + val reindexingNighthawkHashWritableDataRecordStoreList = + if (jobConfig.enableUserReindexingNighthawkHashStore) { + lazy val cacheClientNighthawkConfig = + jobConfig.userReindexingNighthawkHashStoreConfig.online(serviceIdentifier) + List( + UserReindexingNighthawkWritableDataRecordStore.getHashStore( + nighthawkCacheConfig = cacheClientNighthawkConfig, + // Choose a reasonably large target size as this will be equivalent to the number of unique (user, timestamp) + // keys that are returned on read on the pKey, and we may have duplicate authors and associated records. + targetSize = 512, + statsReceiver = statsReceiver, + // Assuming trims are relatively expensive, choose a trimRate that's not as aggressive. In this case we trim on + // 10% of all writes. 
+ trimRate = 0.1 + )) + } else { Nil } + + lazy val replicatingWritableStore = new ReplicatingWritableStore( + stores = List(primaryStore) ++ reindexingNighthawkBtreeWritableDataRecordStoreList + ++ reindexingNighthawkHashWritableDataRecordStoreList + ) + + lazy val combinedStoreWithReindexing = new CombinedStore( + read = primaryStore, + write = replicatingWritableStore + ) + + combinedStoreWithReindexing.toMergeable + } else { + primaryStore + } + + lazy val storeFactory: MergeableStoreFactory[(AggregationKey, BatchID), DataRecord] = + Storm.store(mergeableStore)(Batcher.unit) + Some(storeFactory) + } + case _ => None + } + } + + def buildDataRecordSourceToStorm( + jobConfig: RealTimeAggregatesJobConfig + ): (AggregateSource => Option[Producer[Storm, DataRecord]]) = { (source: AggregateSource) => + { + source match { + case stormAggregateSource: StormAggregateSource => + Some(stormAggregateSource.build(statsReceiver, jobConfig)) + case _ => None + } + } + } + + def apply(args: Args): ProductionStormConfig = { + lazy val isProd = args.boolean("production") + lazy val cluster = args.getOrElse("cluster", "") + lazy val isDebug = args.boolean("debug") + lazy val role = args.getOrElse("role", "") + lazy val service = + args.getOrElse( + "service_name", + "" + ) // don't use the argument service, which is a reserved heron argument + lazy val environment = if (isProd) "prod" else "devel" + lazy val s2sEnabled = args.boolean("s2s") + lazy val keyedByUserEnabled = args.boolean("keyed_by_user") + lazy val keyedByAuthorEnabled = args.boolean("keyed_by_author") + + require(clusters.contains(cluster)) + if (s2sEnabled) { + require(role.length() > 0) + require(service.length() > 0) + } + + lazy val serviceIdentifier = if (s2sEnabled) { + ServiceIdentifier( + role = role, + service = service, + environment = environment, + zone = cluster + ) + } else EmptyServiceIdentifier + + lazy val jobConfig = { + val jobConfig = if (isProd) jobConfigs.Prod else jobConfigs.Devel + 
jobConfig.copy( + serviceIdentifier = serviceIdentifier, + keyedByUserEnabled = keyedByUserEnabled, + keyedByAuthorEnabled = keyedByAuthorEnabled) + } + + lazy val dataRecordSourceToStorm = buildDataRecordSourceToStorm(jobConfig) + lazy val aggregateStoreToStorm = + buildAggregateStoreToStorm(isProd, serviceIdentifier, jobConfig) + + lazy val JaasConfigFlag = "-Djava.security.auth.login.config=resources/jaas.conf" + lazy val JaasDebugFlag = "-Dsun.security.krb5.debug=true" + lazy val JaasConfigString = + if (isDebug) { "%s %s".format(JaasConfigFlag, JaasDebugFlag) } + else JaasConfigFlag + + new ProductionStormConfig { + implicit val jobId: JobId = JobId(jobConfig.name) + override val jobName = JobName(jobConfig.name) + override val teamName = TeamName(jobConfig.teamName) + override val teamEmail = TeamEmail(jobConfig.teamEmail) + override val capTicket = CapTicket("n/a") + + val configureHeronJvmSettings = { + val heronJvmOptions = new java.util.HashMap[String, AnyRef]() + jobConfig.componentToRamGigaBytesMap.foreach { + case (component, gigabytes) => + HeronConfig.setComponentRam( + heronJvmOptions, + component, + ByteAmount.fromGigabytes(gigabytes)) + } + + HeronConfig.setContainerRamRequested( + heronJvmOptions, + ByteAmount.fromGigabytes(jobConfig.containerRamGigaBytes) + ) + + jobConfig.componentsToKerberize.foreach { component => + HeronConfig.setComponentJvmOptions( + heronJvmOptions, + component, + JaasConfigString + ) + } + + jobConfig.componentToMetaSpaceSizeMap.foreach { + case (component, metaspaceSize) => + HeronConfig.setComponentJvmOptions( + heronJvmOptions, + component, + metaspaceSize + ) + } + + heronJvmOptions.asScala.toMap ++ AggregatesV2Job + .aggregateNames(aggregatesToCompute).map { + case (prefix, aggNames) => (s"extras.aggregateNames.${prefix}", aggNames) + } + } + + override def transformConfig(m: Map[String, AnyRef]): Map[String, AnyRef] = { + super.transformConfig(m) ++ List( + /** + * Disable acking by setting acker executors to 0. 
Tuples that come off the + * spout will be immediately acked which effectively disables retries on tuple + * failures. This should help topology throughput/availability by relaxing consistency. + */ + Config.TOPOLOGY_ACKER_EXECUTORS -> int2Integer(0), + Config.TOPOLOGY_WORKERS -> int2Integer(jobConfig.topologyWorkers), + HeronConfig.TOPOLOGY_CONTAINER_CPU_REQUESTED -> int2Integer(8), + HeronConfig.TOPOLOGY_DROPTUPLES_UPON_BACKPRESSURE -> java.lang.Boolean.valueOf(true), + HeronConfig.TOPOLOGY_WORKER_CHILDOPTS -> List( + JaasConfigString, + s"-Dcom.twitter.eventbus.client.zoneName=${cluster}", + "-Dcom.twitter.eventbus.client.EnableKafkaSaslTls=true" + ).mkString(" "), + "storm.job.uniqueId" -> jobId.get + ) ++ configureHeronJvmSettings + + } + + override lazy val getNamedOptions: Map[String, Options] = jobConfig.topologyNamedOptions ++ + Map( + "DEFAULT" -> Options() + .set(flatMapMetrics) + .set(summerMetrics) + .set(MaxWaitingFutures(1000)) + .set(FlushFrequency(30.seconds)) + .set(UseAsyncCache(true)) + .set(AsyncPoolSize(4)) + .set(SourceParallelism(jobConfig.sourceCount)) + .set(SummerBatchMultiplier(1000)), + "FLATMAP" -> Options() + .set(FlatMapParallelism(jobConfig.flatMapCount)) + .set(CacheSize(0)), + "SUMMER" -> Options() + .set(SummerParallelism(jobConfig.summerCount)) + /** + * Sets number of tuples a Summer awaits before aggregation. Set higher + * if you need to lower qps to memcache at the expense of introducing + * some (stable) latency. 
+ */ + .set(CacheSize(jobConfig.cacheSize)) + ) + + val featureCounters: Seq[DataRecordFeatureCounter] = + Seq(DataRecordFeatureCounter.any(Counter(Group("feature_counter"), Name("num_records")))) + + override def graph: TailProducer[Storm, Any] = AggregatesV2Job.generateJobGraph[Storm]( + aggregateSet = aggregatesToCompute, + aggregateSourceToSummingbird = dataRecordSourceToStorm, + aggregateStoreToSummingbird = aggregateStoreToStorm, + featureCounters = featureCounters + ) + } + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/heron/RealTimeAggregatesJobConfig.scala b/timelines/data_processing/ml_util/aggregation_framework/heron/RealTimeAggregatesJobConfig.scala new file mode 100644 index 0000000000..8bed262645 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/heron/RealTimeAggregatesJobConfig.scala @@ -0,0 +1,79 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron + +import com.twitter.finagle.mtls.authentication.EmptyServiceIdentifier +import com.twitter.finagle.mtls.authentication.ServiceIdentifier +import com.twitter.ml.api.DataRecord +import com.twitter.summingbird.Options +import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform + +/** + * + * @param appId application id for topology job + * @param topologyWorkers number of workers/containers of topology + * @param sourceCount number of parallel sprouts of topology + * @param summerCount number of Summer of topology + * @param cacheSize number of tuples a Summer awaits before aggregation. + * @param flatMapCount number of parallel FlatMap of topology + * @param containerRamGigaBytes total RAM of each worker/container has + * @param name name of topology job + * @param teamName name of team who owns topology job + * @param teamEmail email of team who owns topology job + * @param componentsToKerberize component of topology job (eg. 
Tail-FlatMap-Source) which enables kerberization + * @param componentToMetaSpaceSizeMap MetaSpaceSize settings for components of topology job + * @param topologyNamedOptions Sets spout allocations for named topology components + * @param serviceIdentifier represents the identifier used for Service to Service Authentication + * @param onlinePreTransforms sequential data record transforms applied to Producer of DataRecord before creating AggregateGroup. + * While preTransforms defined at AggregateGroup are applied to each aggregate group, onlinePreTransforms are applied to the whole producer source. + * @param keyedByUserEnabled boolean value to enable/disable merging user-level features from Feature Store + * @param keyedByAuthorEnabled boolean value to enable/disable merging author-level features from Feature Store + * @param enableUserReindexingNighthawkBtreeStore boolean value to enable reindexing RTAs on user id with btree backed nighthawk + * @param enableUserReindexingNighthawkHashStore boolean value to enable reindexing RTAs on user id with hash backed nighthawk + * @param userReindexingNighthawkBtreeStoreConfig NH btree store config used in reindexing user RTAs + * @param userReindexingNighthawkHashStoreConfig NH hash store config used in reindexing user RTAs + */ +case class RealTimeAggregatesJobConfig( + appId: String, + topologyWorkers: Int, + sourceCount: Int, + summerCount: Int, + cacheSize: Int, + flatMapCount: Int, + containerRamGigaBytes: Int, + name: String, + teamName: String, + teamEmail: String, + componentsToKerberize: Seq[String] = Seq.empty, + componentToMetaSpaceSizeMap: Map[String, String] = Map.empty, + componentToRamGigaBytesMap: Map[String, Int] = Map("Tail" -> 4), + topologyNamedOptions: Map[String, Options] = Map.empty, + serviceIdentifier: ServiceIdentifier = EmptyServiceIdentifier, + onlinePreTransforms: Seq[OneToSomeTransform] = Seq.empty, + keyedByUserEnabled: Boolean = false, + keyedByAuthorEnabled: Boolean = false, + 
keyedByTweetEnabled: Boolean = false, + enableUserReindexingNighthawkBtreeStore: Boolean = false, + enableUserReindexingNighthawkHashStore: Boolean = false, + userReindexingNighthawkBtreeStoreConfig: NighthawkUnderlyingStoreConfig = + NighthawkUnderlyingStoreConfig(), + userReindexingNighthawkHashStoreConfig: NighthawkUnderlyingStoreConfig = + NighthawkUnderlyingStoreConfig()) { + + /** + * Apply transforms sequentially. If any transform results in a dropped (None) + * DataRecord, then entire transform sequence will result in a dropped DataRecord. + * Note that transforms are order-dependent. + */ + def sequentiallyTransform(dataRecord: DataRecord): Option[DataRecord] = { + val recordOpt = Option(new DataRecord(dataRecord)) + onlinePreTransforms.foldLeft(recordOpt) { + case (Some(previousRecord), preTransform) => + preTransform(previousRecord) + case _ => Option.empty[DataRecord] + } + } +} + +trait RealTimeAggregatesJobConfigs { + def Prod: RealTimeAggregatesJobConfig + def Devel: RealTimeAggregatesJobConfig +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/heron/StormAggregateSource.scala b/timelines/data_processing/ml_util/aggregation_framework/heron/StormAggregateSource.scala new file mode 100644 index 0000000000..a252cf197f --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/heron/StormAggregateSource.scala @@ -0,0 +1,27 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron + +import com.twitter.finagle.stats.StatsReceiver +import com.twitter.ml.api.DataRecord +import com.twitter.ml.api.Feature +import com.twitter.summingbird._ +import com.twitter.summingbird.storm.Storm +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateSource +import java.lang.{Long => JLong} + +/** + * Use this trait to implement online summingbird producer that subscribes to + * spouts and generates a data record. 
+ */ +trait StormAggregateSource extends AggregateSource { + def name: String + + def timestampFeature: Feature[JLong] + + /** + * Constructs the storm Producer with the implemented topology at runtime. + */ + def build( + statsReceiver: StatsReceiver, + jobConfig: RealTimeAggregatesJobConfig + ): Producer[Storm, DataRecord] +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/heron/UserReindexingNighthawkStore.scala b/timelines/data_processing/ml_util/aggregation_framework/heron/UserReindexingNighthawkStore.scala new file mode 100644 index 0000000000..a4d2adeacd --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/heron/UserReindexingNighthawkStore.scala @@ -0,0 +1,309 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron + +import com.twitter.bijection.Injection +import com.twitter.bijection.thrift.CompactThriftCodec +import com.twitter.cache.client._ +import com.twitter.finagle.stats.StatsReceiver +import com.twitter.ml.api.DataRecord +import com.twitter.ml.api.constant.SharedFeatures +import com.twitter.ml.api.util.SRichDataRecord +import com.twitter.storehaus.WritableStore +import com.twitter.storehaus_internal.nighthawk_kv.CacheClientNighthawkConfig +import com.twitter.storehaus_internal.nighthawk_kv.NighthawkStore +import com.twitter.summingbird.batch.BatchID +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.UserReindexingNighthawkWritableDataRecordStore._ +import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures +import com.twitter.util.Future +import com.twitter.util.Time +import com.twitter.util.Try +import com.twitter.util.logging.Logger +import java.nio.ByteBuffer +import java.util +import scala.util.Random + +object 
UserReindexingNighthawkWritableDataRecordStore { + implicit val longInjection = Injection.long2BigEndian + implicit val dataRecordInjection: Injection[DataRecord, Array[Byte]] = + CompactThriftCodec[DataRecord] + val arrayToByteBuffer = Injection.connect[Array[Byte], ByteBuffer] + val longToByteBuffer = longInjection.andThen(arrayToByteBuffer) + val dataRecordToByteBuffer = dataRecordInjection.andThen(arrayToByteBuffer) + + def getBtreeStore( + nighthawkCacheConfig: CacheClientNighthawkConfig, + targetSize: Int, + statsReceiver: StatsReceiver, + trimRate: Double + ): UserReindexingNighthawkBtreeWritableDataRecordStore = + new UserReindexingNighthawkBtreeWritableDataRecordStore( + nighthawkStore = NighthawkStore[UserId, TimestampMs, DataRecord](nighthawkCacheConfig) + .asInstanceOf[NighthawkStore[UserId, TimestampMs, DataRecord]], + tableName = nighthawkCacheConfig.table.toString, + targetSize = targetSize, + statsReceiver = statsReceiver, + trimRate = trimRate + ) + + def getHashStore( + nighthawkCacheConfig: CacheClientNighthawkConfig, + targetSize: Int, + statsReceiver: StatsReceiver, + trimRate: Double + ): UserReindexingNighthawkHashWritableDataRecordStore = + new UserReindexingNighthawkHashWritableDataRecordStore( + nighthawkStore = NighthawkStore[UserId, AuthorId, DataRecord](nighthawkCacheConfig) + .asInstanceOf[NighthawkStore[UserId, AuthorId, DataRecord]], + tableName = nighthawkCacheConfig.table.toString, + targetSize = targetSize, + statsReceiver = statsReceiver, + trimRate = trimRate + ) + + def buildTimestampedByteBuffer(timestamp: Long, bb: ByteBuffer): ByteBuffer = { + val timestampedBb = ByteBuffer.allocate(getLength(bb) + java.lang.Long.SIZE) + timestampedBb.putLong(timestamp) + timestampedBb.put(bb) + timestampedBb + } + + def extractTimestampFromTimestampedByteBuffer(bb: ByteBuffer): Long = { + bb.getLong(0) + } + + def extractValueFromTimestampedByteBuffer(bb: ByteBuffer): ByteBuffer = { + val bytes = new Array[Byte](getLength(bb) - 
java.lang.Long.SIZE) + util.Arrays.copyOfRange(bytes, java.lang.Long.SIZE, getLength(bb)) + ByteBuffer.wrap(bytes) + } + + def transformAndBuildKeyValueMapping( + table: String, + userId: UserId, + authorIdsAndDataRecords: Seq[(AuthorId, DataRecord)] + ): KeyValue = { + val timestamp = Time.now.inMillis + val pkey = longToByteBuffer(userId) + val lkeysAndTimestampedValues = authorIdsAndDataRecords.map { + case (authorId, dataRecord) => + val lkey = longToByteBuffer(authorId) + // Create a byte buffer with a prepended timestamp to reduce deserialization cost + // when parsing values. We only have to extract and deserialize the timestamp in the + // ByteBuffer in order to sort the value, as opposed to deserializing the DataRecord + // and having to get a timestamp feature value from the DataRecord. + val dataRecordBb = dataRecordToByteBuffer(dataRecord) + val timestampedValue = buildTimestampedByteBuffer(timestamp, dataRecordBb) + (lkey, timestampedValue) + } + buildKeyValueMapping(table, pkey, lkeysAndTimestampedValues) + } + + def buildKeyValueMapping( + table: String, + pkey: ByteBuffer, + lkeysAndTimestampedValues: Seq[(ByteBuffer, ByteBuffer)] + ): KeyValue = { + val lkeys = lkeysAndTimestampedValues.map { case (lkey, _) => lkey } + val timestampedValues = lkeysAndTimestampedValues.map { case (_, value) => value } + val kv = KeyValue( + key = Key(table = table, pkey = pkey, lkeys = lkeys), + value = Value(timestampedValues) + ) + kv + } + + private def getLength(bb: ByteBuffer): Int = { + // capacity can be an over-estimate of the actual length (remaining - start position) + // but it's the safest to avoid overflows. + bb.capacity() + } +} + +/** + * Implements a NH store that stores aggregate feature DataRecords using userId as the primary key. + * + * This store re-indexes user-author keyed real-time aggregate (RTA) features on userId by + * writing to a userId primary key (pkey) and timestamp secondary key (lkey). 
To fetch user-author + * RTAs for a given user from cache, the caller just needs to make a single RPC for the userId pkey. + * The downside of a re-indexing store is that we cannot store arbitrarily many secondary keys + * under the primary key. This specific implementation using the NH btree backend also mandates + * an ordering of secondary keys - we therefore use timestamp as the secondary key + * as opposed to, say, authorId. + * + * Note that a caller of the btree backed NH re-indexing store receives back a response where the + * secondary key is a timestamp. The associated value is a DataRecord containing user-author related + * aggregate features which was last updated at the timestamp. The caller therefore needs to handle + * the response and dedupe on unique, most recent user-author pairs. + * + * For a discussion on this and other implementations, please see: + * https://docs.google.com/document/d/1yVzAbQ_ikLqwSf230URxCJmSKj5yZr5dYv6TwBlQw18/edit + */ +class UserReindexingNighthawkBtreeWritableDataRecordStore( + nighthawkStore: NighthawkStore[UserId, TimestampMs, DataRecord], + tableName: String, + targetSize: Int, + statsReceiver: StatsReceiver, + trimRate: Double = 0.1 // by default, trim on 10% of puts +) extends WritableStore[(AggregationKey, BatchID), Option[DataRecord]] { + + private val scope = getClass.getSimpleName + private val failures = statsReceiver.counter(scope, "failures") + private val log = Logger.getLogger(getClass) + private val random: Random = new Random(1729L) + + override def put(kv: ((AggregationKey, BatchID), Option[DataRecord])): Future[Unit] = { + val ((aggregationKey, _), dataRecordOpt) = kv + // Fire-and-forget below because the store itself should just be a side effect + // as it's just making re-indexed writes based on the writes to the primary store. 
+ for { + userId <- aggregationKey.discreteFeaturesById.get(SharedFeatures.USER_ID.getFeatureId) + dataRecord <- dataRecordOpt + } yield { + SRichDataRecord(dataRecord) + .getFeatureValueOpt(TypedAggregateGroup.timestampFeature) + .map(_.toLong) // convert to Scala Long + .map { timestamp => + val trim: Future[Unit] = if (random.nextDouble <= trimRate) { + val trimKey = TrimKey( + table = tableName, + pkey = longToByteBuffer(userId), + targetSize = targetSize, + ascending = true + ) + nighthawkStore.client.trim(Seq(trimKey)).unit + } else { + Future.Unit + } + // We should wait for trim to complete above + val fireAndForget = trim.before { + val kvTuple = ((userId, timestamp), Some(dataRecord)) + nighthawkStore.put(kvTuple) + } + + fireAndForget.onFailure { + case e => + failures.incr() + // Name this (btree) store in the message so failures are not attributed to the hash store + log.error("Failure in UserReindexingNighthawkBtreeWritableDataRecordStore", e) + } + } + } + // Ignore fire-and-forget result above and simply return + Future.Unit + } +} + +/** + * Implements a NH store that stores aggregate feature DataRecords using userId as the primary key. + * + * This store re-indexes user-author keyed real-time aggregate (RTA) features on userId by + * writing to a userId primary key (pkey) and authorId secondary key (lkey). To fetch user-author + * RTAs for a given user from cache, the caller just needs to make a single RPC for the userId pkey. + * The downside of a re-indexing store is that we cannot store arbitrarily + * many secondary keys under the primary key. We have to limit them in some way; + * here, we do so by randomly (based on trimRate) issuing an HGETALL command (via scan) to + * retrieve the whole hash, sort by oldest timestamp, and then remove the oldest authors to keep + * only targetSize authors (aka trim), where targetSize is configurable. + * + * @note The full hash returned from scan could be as large (or even larger) than targetSize, + * which could mean many DataRecords to deserialize, especially at high write qps. 
+ * To reduce deserialization cost post-scan, we use timestamped values with a prepended timestamp + * in the value ByteBuffer; this allows us to only deserialize the timestamp and not the full + * DataRecord when sorting. This is necessary in order to identify the oldest values to trim. + * When we do a put for a new (user, author) pair, we also write out timestamped values. + * + * For a discussion on this and other implementations, please see: + * https://docs.google.com/document/d/1yVzAbQ_ikLqwSf230URxCJmSKj5yZr5dYv6TwBlQw18/edit + */ +class UserReindexingNighthawkHashWritableDataRecordStore( + nighthawkStore: NighthawkStore[UserId, AuthorId, DataRecord], + tableName: String, + targetSize: Int, + statsReceiver: StatsReceiver, + trimRate: Double = 0.1 // by default, trim on 10% of puts +) extends WritableStore[(AggregationKey, BatchID), Option[DataRecord]] { + + private val scope = getClass.getSimpleName + private val scanMismatchErrors = statsReceiver.counter(scope, "scanMismatchErrors") + private val failures = statsReceiver.counter(scope, "failures") + private val log = Logger.getLogger(getClass) + private val random: Random = new Random(1729L) + private val arrayToByteBuffer = Injection.connect[Array[Byte], ByteBuffer] + private val longToByteBuffer = Injection.long2BigEndian.andThen(arrayToByteBuffer) + + override def put(kv: ((AggregationKey, BatchID), Option[DataRecord])): Future[Unit] = { + val ((aggregationKey, _), dataRecordOpt) = kv + // Fire-and-forget below because the store itself should just be a side effect + // as it's just making re-indexed writes based on the writes to the primary store. 
+ for { + userId <- aggregationKey.discreteFeaturesById.get(SharedFeatures.USER_ID.getFeatureId) + authorId <- aggregationKey.discreteFeaturesById.get( + TimelinesSharedFeatures.SOURCE_AUTHOR_ID.getFeatureId) + dataRecord <- dataRecordOpt + } yield { + val scanAndTrim: Future[Unit] = if (random.nextDouble <= trimRate) { + val scanKey = ScanKey( + table = tableName, + pkey = longToByteBuffer(userId) + ) + nighthawkStore.client.scan(Seq(scanKey)).flatMap { scanResults: Seq[Try[KeyValue]] => + scanResults.headOption + .flatMap(_.toOption).map { keyValue: KeyValue => + val lkeys: Seq[ByteBuffer] = keyValue.key.lkeys + // these are timestamped bytebuffers + val timestampedValues: Seq[ByteBuffer] = keyValue.value.values + // this should fail loudly if this is not true. it would indicate + // there is a mistake in the scan. + if (lkeys.size != timestampedValues.size) scanMismatchErrors.incr() + assert(lkeys.size == timestampedValues.size) + if (lkeys.size > targetSize) { + // Surplus entries beyond targetSize. This must be positive: take(n) with a + // non-positive n selects nothing, which would silently disable the trim. + val numToRemove = lkeys.size - targetSize + // sort by oldest and take top k oldest and remove - this is equivalent to a trim + val oldestKeys: Seq[ByteBuffer] = lkeys + .zip(timestampedValues) + .map { + case (lkey, timestampedValue) => + val timestamp = extractTimestampFromTimestampedByteBuffer(timestampedValue) + (timestamp, lkey) + } + .sortBy { case (timestamp, _) => timestamp } + .take(numToRemove) + .map { case (_, k) => k } + val pkey = longToByteBuffer(userId) + val key = Key(table = tableName, pkey = pkey, lkeys = oldestKeys) + // NOTE: `remove` is a batch API, and we group all lkeys into a single batch (batch + // size = single group of lkeys = 1). Instead, we could separate lkeys into smaller + // groups and have batch size = number of groups, but this is more complex. + // Performance implications of batching vs non-batching need to be assessed. 
+ nighthawkStore.client + .remove(Seq(key)) + .map { responses => + responses.map(resp => nighthawkStore.processValue(resp)) + }.unit + } else { + Future.Unit + } + }.getOrElse(Future.Unit) + } + } else { + Future.Unit + } + // We should wait for scan and trim to complete above + val fireAndForget = scanAndTrim.before { + val kv = transformAndBuildKeyValueMapping(tableName, userId, Seq((authorId, dataRecord))) + nighthawkStore.client + .put(Seq(kv)) + .map { responses => + responses.map(resp => nighthawkStore.processValue(resp)) + }.unit + } + fireAndForget.onFailure { + case e => + failures.incr() + log.error("Failure in UserReindexingNighthawkHashWritableDataRecordStore", e) + } + } + // Ignore fire-and-forget result above and simply return + Future.Unit + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/heron/package.scala b/timelines/data_processing/ml_util/aggregation_framework/heron/package.scala new file mode 100644 index 0000000000..e995cf202b --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/heron/package.scala @@ -0,0 +1,8 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework + +package object heron { + // NOTE: please sort alphabetically + type AuthorId = Long + type UserId = Long + type TimestampMs = Long +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/job/AggregatesV2Job.scala b/timelines/data_processing/ml_util/aggregation_framework/job/AggregatesV2Job.scala new file mode 100644 index 0000000000..7d9e1946e8 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/job/AggregatesV2Job.scala @@ -0,0 +1,163 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.job + +import com.twitter.algebird.Semigroup +import com.twitter.ml.api.DataRecord +import com.twitter.ml.api.DataRecordMerger +import com.twitter.summingbird.Platform +import com.twitter.summingbird.Producer +import com.twitter.summingbird.TailProducer 
+import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateSource +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateStore +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup + +object AggregatesV2Job { + private lazy val merger = new DataRecordMerger + + /** + * Merges all "incremental" records with the same aggregation key + * into a single record. + * + * @param recordsPerKey A set of (AggregationKey, DataRecord) tuples + * known to share the same AggregationKey + * @return A single merged datarecord + */ + def mergeRecords(recordsPerKey: Set[(AggregationKey, DataRecord)]): DataRecord = + recordsPerKey.foldLeft(new DataRecord) { + case (merged: DataRecord, (key: AggregationKey, elem: DataRecord)) => { + merger.merge(merged, elem) + merged + } + } + + /** + * Given a set of aggregates to compute and a datarecord, extract key-value + * pairs to output to the summingbird store. + * + * @param dataRecord input data record + * @param aggregates set of aggregates to compute + * @param featureCounters counters to apply to each input data record + * @return computed aggregates + */ + def computeAggregates( + dataRecord: DataRecord, + aggregates: Set[TypedAggregateGroup[_]], + featureCounters: Seq[DataRecordFeatureCounter] + ): Map[AggregationKey, DataRecord] = { + val computedAggregates = aggregates + .flatMap(_.computeAggregateKVPairs(dataRecord)) + .groupBy { case (aggregationKey: AggregationKey, _) => aggregationKey } + .mapValues(mergeRecords) + + featureCounters.foreach(counter => + computedAggregates.map(agg => DataRecordFeatureCounter(counter, agg._2))) + + computedAggregates + + } + + /** + * Util method to apply a filter on containment in an optional set. + * + * @param setOptional Optional set of items to check containment in. 
+ * @param toCheck Item to check if contained in set. + * @return If the optional set is None, returns true. + */ + def setFilter[T](setOptional: Option[Set[T]], toCheck: T): Boolean = + setOptional.map(_.contains(toCheck)).getOrElse(true) + + /** + * Util for filtering a collection of `TypedAggregateGroup` + * + * @param aggregates a set of aggregates + * @param sourceNames Optional filter on which AggregateGroups to process + * based on the name of the input source. + * @param storeNames Optional filter on which AggregateGroups to process + * based on the name of the output store. + * @return filtered aggregates + */ + def filterAggregates( + aggregates: Set[TypedAggregateGroup[_]], + sourceNames: Option[Set[String]], + storeNames: Option[Set[String]] + ): Set[TypedAggregateGroup[_]] = + aggregates + .filter { aggregateGroup => + val sourceName = aggregateGroup.inputSource.name + val storeName = aggregateGroup.outputStore.name + val containsSource = setFilter(sourceNames, sourceName) + val containsStore = setFilter(storeNames, storeName) + containsSource && containsStore + } + + /** + * The core summingbird job code. + * + * For each aggregate in the set passed in, the job + * processes all datarecords in the input producer + * stream to generate "incremental" contributions to + * these aggregates, and emits them grouped by + * aggregation key so that summingbird can aggregate them. + * + * It is important that after applying the sourceNameFilter and storeNameFilter, + * all the result AggregateGroups share the same startDate, otherwise the job + * will fail or give invalid results. + * + * @param aggregateSet A set of aggregates to compute. All aggregates + * in this set that pass the sourceNameFilter and storeNameFilter + * defined below, if any, will be computed. + * @param aggregateSourceToSummingbird Function that maps from our logical + * AggregateSource abstraction to the underlying physical summingbird + * producer of data records to aggregate (e.g. 
scalding/eventbus source) + * @param aggregateStoreToSummingbird Function that maps from our logical + * AggregateStore abstraction to the underlying physical summingbird + * store to write output aggregate records to (e.g. Manhattan for scalding, + * or memcache for heron) + * @param featureCounters counters to use with each input DataRecord + * @return summingbird tail producer + */ + def generateJobGraph[P <: Platform[P]]( + aggregateSet: Set[TypedAggregateGroup[_]], + aggregateSourceToSummingbird: AggregateSource => Option[Producer[P, DataRecord]], + aggregateStoreToSummingbird: AggregateStore => Option[P#Store[AggregationKey, DataRecord]], + featureCounters: Seq[DataRecordFeatureCounter] = Seq.empty + )( + implicit semigroup: Semigroup[DataRecord] + ): TailProducer[P, Any] = { + val tailProducerList: List[TailProducer[P, Any]] = aggregateSet + .groupBy { aggregate => (aggregate.inputSource, aggregate.outputStore) } + .flatMap { + case ( + (inputSource: AggregateSource, outputStore: AggregateStore), + aggregatesInThisStore + ) => { + val producerOpt = aggregateSourceToSummingbird(inputSource) + val storeOpt = aggregateStoreToSummingbird(outputStore) + + (producerOpt, storeOpt) match { + case (Some(producer), Some(store)) => + Some( + producer + .flatMap(computeAggregates(_, aggregatesInThisStore, featureCounters)) + .name("FLATMAP") + .sumByKey(store) + .name("SUMMER") + ) + case _ => None + } + } + } + .toList + + tailProducerList.reduceLeft { (left, right) => left.also(right) } + } + + def aggregateNames(aggregateSet: Set[TypedAggregateGroup[_]]) = { + aggregateSet + .map(typedGroup => + ( + typedGroup.aggregatePrefix, + typedGroup.individualAggregateDescriptors + .flatMap(_.outputFeatures.map(_.getFeatureName)).mkString(","))) + }.toMap +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/job/BUILD b/timelines/data_processing/ml_util/aggregation_framework/job/BUILD new file mode 100644 index 0000000000..57593fa343 --- /dev/null +++ 
b/timelines/data_processing/ml_util/aggregation_framework/job/BUILD @@ -0,0 +1,19 @@ +scala_library( + sources = ["*.scala"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "3rdparty/jvm/com/twitter/algebird:core", + "3rdparty/jvm/com/twitter/algebird:util", + "3rdparty/jvm/com/twitter/storehaus:algebra", + "3rdparty/jvm/com/twitter/storehaus:core", + "3rdparty/src/jvm/com/twitter/scalding:commons", + "3rdparty/src/jvm/com/twitter/scalding:core", + "3rdparty/src/jvm/com/twitter/summingbird:batch", + "3rdparty/src/jvm/com/twitter/summingbird:core", + "src/java/com/twitter/ml/api:api-base", + "src/thrift/com/twitter/ml/api:data-java", + "src/thrift/com/twitter/ml/api:interpretable-model-java", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) diff --git a/timelines/data_processing/ml_util/aggregation_framework/job/DataRecordFeatureCounter.scala b/timelines/data_processing/ml_util/aggregation_framework/job/DataRecordFeatureCounter.scala new file mode 100644 index 0000000000..eb1580a11a --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/job/DataRecordFeatureCounter.scala @@ -0,0 +1,39 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.job + +import com.twitter.ml.api.DataRecord +import com.twitter.summingbird.Counter + +/** + * A summingbird Counter which is associated with a predicate which operates on + * [[com.twitter.ml.api.DataRecord]] instances. + * + * For example, for a data record which represents a Tweet, one could define a predicate + * which checks whether the Tweet contains a binary feature representing the presence of + * an image. The counter can then be used to represent the count of Tweets with + * images processed. 
+ * + * @param predicate a predicate which gates the counter + * @param counter a summingbird Counter instance + */ +case class DataRecordFeatureCounter(predicate: DataRecord => Boolean, counter: Counter) + +object DataRecordFeatureCounter { + + /** + * Increments the counter if the record satisfies the predicate + * + * @param recordCounter a data record counter + * @param record a data record + */ + def apply(recordCounter: DataRecordFeatureCounter, record: DataRecord): Unit = + if (recordCounter.predicate(record)) recordCounter.counter.incr() + + /** + * Defines a feature counter with a predicate that is always true + * + * @param counter a summingbird Counter instance + * @return a data record counter + */ + def any(counter: Counter): DataRecordFeatureCounter = + DataRecordFeatureCounter({ _: DataRecord => true }, counter) +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/metrics/AggregateFeature.scala b/timelines/data_processing/ml_util/aggregation_framework/metrics/AggregateFeature.scala new file mode 100644 index 0000000000..4f80490bcf --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/metrics/AggregateFeature.scala @@ -0,0 +1,51 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics + +import com.twitter.util.Duration +import com.twitter.ml.api._ +import java.lang.{Boolean => JBoolean} + +/** + * Case class used as shared argument for + * getAggregateValue() and setAggregateValue() in AggregationMetric. + * + * @param aggregatePrefix Prefix for aggregate feature name + * @param feature Simple (non-aggregate) feature being aggregated. This + is optional; if None, then the label is aggregated on its own without + being crossed with any feature. + * @param label Label being paired with. This is optional; if None, then + the feature is aggregated on its own without being crossed with any label. 
+ * @param halfLife Half life being used for aggregation + */ +case class AggregateFeature[T]( + aggregatePrefix: String, + feature: Option[Feature[T]], + label: Option[Feature[JBoolean]], + halfLife: Duration) { + val aggregateType = "pair" + val labelName: String = label.map(_.getDenseFeatureName()).getOrElse("any_label") + val featureName: String = feature.map(_.getDenseFeatureName()).getOrElse("any_feature") + + /* + * This val precomputes a portion of the feature name + * for faster processing. String building turns + * out to be a significant bottleneck. + */ + val featurePrefix: String = List( + aggregatePrefix, + aggregateType, + labelName, + featureName, + halfLife.toString + ).mkString(".") +} + +/* Companion object with util methods. */ +object AggregateFeature { + def parseHalfLife(aggregateFeature: Feature[_]): Duration = { + val aggregateComponents = aggregateFeature.getDenseFeatureName().split("\\.") + val numComponents = aggregateComponents.length + val halfLifeStr = aggregateComponents(numComponents - 3) + "." + + aggregateComponents(numComponents - 2) + Duration.parse(halfLifeStr) + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/metrics/AggregationMetric.scala b/timelines/data_processing/ml_util/aggregation_framework/metrics/AggregationMetric.scala new file mode 100644 index 0000000000..4278c88126 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/metrics/AggregationMetric.scala @@ -0,0 +1,184 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics + +import com.twitter.ml.api._ +import com.twitter.ml.api.constant.SharedFeatures +import com.twitter.ml.api.util.SRichDataRecord +import com.twitter.util.Duration +import java.lang.{Long => JLong} + +/** + * Represents an aggregation operator (e.g. count or mean). + * Override all functions in this trait to implement your own metric. 
+ * The operator is parameterized on an input type T, which is the type + * of feature it aggregates, and a TimedValue[A] which is + * the result type of aggregation for this metric. + */ +trait AggregationMetric[T, A] extends FeatureCache[T] { + /* + * Combines two timed aggregate values ''left'' and ''right'' + * with the specified half life ''halfLife'' to produce a result + * TimedValue + * + * @param left Left timed value + * @param right Right timed value + * @param halfLife Half life to use for adding timed values + * @return Result timed value + */ + def plus(left: TimedValue[A], right: TimedValue[A], halfLife: Duration): TimedValue[A] + + /* + * Gets increment value given a datarecord and a feature. + * + * @param dataRecord to get increment value from. + * @param feature Feature to get increment value for. If None, + then the semantics is to just aggregate the label. + * @param timestampFeature Feature to use as millisecond timestamp + for decayed value aggregation. + * @return The incremental contribution to the aggregate of ''feature'' from ''dataRecord''. + * + * For example, if the aggregation metric is count, the incremental + * contribution is always a TimedValue (1.0, time). If the aggregation metric + * is mean, and the feature is a continuous feature (double), the incremental + * contribution looks like a tuple (value, 1.0, time) + */ + def getIncrementValue( + dataRecord: DataRecord, + feature: Option[Feature[T]], + timestampFeature: Feature[JLong] + ): TimedValue[A] + + /* + * The "zero" value for aggregation. + * For example, the zero is 0 for the count operator. + */ + def zero(timeOpt: Option[Long] = None): TimedValue[A] + + /* + * Gets the value of aggregate feature(s) stored in a datarecord, if any. + * Different aggregate operators might store this info in the datarecord + * differently. E.g. count just stores a count, while mean needs to + * store both a sum and a count, and compile them into a TimedValue. 
We call + * these features stored in the record "output" features. + * + * @param record Record to get value from + * @param query AggregateFeature (see above) specifying details of aggregate + * @param aggregateOutputs An optional precomputed set of aggregation "output" + * feature hashes for this (query, metric) pair. This can be derived from ''query'', + * but we precompute and pass this in for significantly (approximately 4x = 400%) + * faster performance. If not passed in, the operator should reconstruct these features + * from scratch. + * + * @return The aggregate value if found in ''record'', else the appropriate "zero" + for this type of aggregation. + */ + def getAggregateValue( + record: DataRecord, + query: AggregateFeature[T], + aggregateOutputs: Option[List[JLong]] = None + ): TimedValue[A] + + /* + * Sets the value of aggregate feature(s) in a datarecord. Different operators + * will have different representations (see example above). + * + * @param record Record to set value in + * @param query AggregateFeature (see above) specifying details of aggregate + * @param aggregateOutputs An optional precomputed set of aggregation "output" + * features for this (query, metric) pair. This can be derived from ''query'', + * but we precompute and pass this in for significantly (approximately 4x = 400%) + * faster performance. If not passed in, the operator should reconstruct these features + * from scratch. + * + * @param value Value to set for aggregate feature in the record being passed in via ''query'' + */ + def setAggregateValue( + record: DataRecord, + query: AggregateFeature[T], + aggregateOutputs: Option[List[JLong]] = None, + value: TimedValue[A] + ): Unit + + /** + * Get features used to store aggregate output representation + * in partially aggregated data records. + * + * @query AggregateFeature (see above) specifying details of aggregate + * @return A list of "output" features used by this metric to store + * output representation. 
For example, for the "count" operator, we + * have only one element in this list, which is the result "count" feature. + * For the "mean" operator, we have three elements in this list: the "count" + * feature, the "sum" feature and the "mean" feature. + */ + def getOutputFeatures(query: AggregateFeature[T]): List[Feature[_]] + + /** + * Get feature hashes used to store aggregate output representation + * in partially aggregated data records. + * + * @query AggregateFeature (see above) specifying details of aggregate + * @return A list of "output" feature hashes used by this metric to store + * output representation. For example, for the "count" operator, we + * have only one element in this list, which is the result "count" feature. + * For the "mean" operator, we have three elements in this list: the "count" + * feature, the "sum" feature and the "mean" feature. + */ + def getOutputFeatureIds(query: AggregateFeature[T]): List[JLong] = + getOutputFeatures(query) + .map(_.getDenseFeatureId().asInstanceOf[JLong]) + + /* + * Sums the given feature in two datarecords into a result record + * WARNING: this method has side-effects; it modifies combined + * + * @param combined Result datarecord to mutate and store addition result in + * @param left Left datarecord to add + * @param right Right datarecord to add + * @param query Details of aggregate to add + * @param aggregateOutputs An optional precomputed set of aggregation "output" + * feature hashes for this (query, metric) pair. This can be derived from ''query'', + * but we precompute and pass this in for significantly (approximately 4x = 400%) + * faster performance. If not passed in, the operator should reconstruct these features + * from scratch. 
+ */ + def mutatePlus( + combined: DataRecord, + left: DataRecord, + right: DataRecord, + query: AggregateFeature[T], + aggregateOutputs: Option[List[JLong]] = None + ): Unit = { + val leftValue = getAggregateValue(left, query, aggregateOutputs) + val rightValue = getAggregateValue(right, query, aggregateOutputs) + val combinedValue = plus(leftValue, rightValue, query.halfLife) + setAggregateValue(combined, query, aggregateOutputs, combinedValue) + } + + /** + * Helper function to get increment value from an input DataRecord + * and copy it to an output DataRecord, given an AggregateFeature query spec. + * + * @param output Datarecord to output increment to (will be mutated by this method) + * @param input Datarecord to get increment from + * @param query Details of aggregation + * @param aggregateOutputs An optional precomputed set of aggregation "output" + * feature hashes for this (query, metric) pair. This can be derived from ''query'', + * but we precompute and pass this in for significantly (approximately 4x = 400%) + * faster performance. If not passed in, the operator should reconstruct these features + * from scratch. 
+ * @return True if an increment was set in the output record, else false + */ + def setIncrement( + output: DataRecord, + input: DataRecord, + query: AggregateFeature[T], + timestampFeature: Feature[JLong] = SharedFeatures.TIMESTAMP, + aggregateOutputs: Option[List[JLong]] = None + ): Boolean = { + if (query.label == None || + (query.label.isDefined && SRichDataRecord(input).hasFeature(query.label.get))) { + val incrementValue: TimedValue[A] = getIncrementValue(input, query.feature, timestampFeature) + setAggregateValue(output, query, aggregateOutputs, incrementValue) + true + } else false + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/metrics/AggregationMetricCommon.scala b/timelines/data_processing/ml_util/aggregation_framework/metrics/AggregationMetricCommon.scala new file mode 100644 index 0000000000..e7b97e07bc --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/metrics/AggregationMetricCommon.scala @@ -0,0 +1,55 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics + +import com.twitter.algebird.DecayedValue +import com.twitter.algebird.DecayedValueMonoid +import com.twitter.algebird.Monoid +import com.twitter.dal.personal_data.thriftjava.PersonalDataType +import com.twitter.ml.api._ +import com.twitter.ml.api.constant.SharedFeatures +import com.twitter.ml.api.util.SRichDataRecord +import com.twitter.util.Duration +import java.lang.{Long => JLong} +import java.util.{HashSet => JHashSet} +import java.util.{Set => JSet} + +object AggregationMetricCommon { + /* Shared definitions and utils that can be reused by child classes */ + val Epsilon: Double = 1e-6 + val decayedValueMonoid: Monoid[DecayedValue] = DecayedValueMonoid(Epsilon) + val TimestampHash: JLong = SharedFeatures.TIMESTAMP.getDenseFeatureId() + + def toDecayedValue(tv: TimedValue[Double], halfLife: Duration): DecayedValue = { + DecayedValue.build( + tv.value, + tv.timestamp.inMilliseconds, + 
halfLife.inMilliseconds + ) + } + + def getTimestamp( + record: DataRecord, + timestampFeature: Feature[JLong] = SharedFeatures.TIMESTAMP + ): Long = { + Option( + SRichDataRecord(record) + .getFeatureValue(timestampFeature) + ).map(_.toLong) + .getOrElse(0L) + } + + /* + * Union the PDTs of the input featureOpts. + * Return null if empty, else the JSet[PersonalDataType] + */ + def derivePersonalDataTypes(features: Option[Feature[_]]*): JSet[PersonalDataType] = { + val unionPersonalDataTypes = new JHashSet[PersonalDataType]() + for { + featureOpt <- features + feature <- featureOpt + pdtSetOptional = feature.getPersonalDataTypes + if pdtSetOptional.isPresent + pdtSet = pdtSetOptional.get + } unionPersonalDataTypes.addAll(pdtSet) + if (unionPersonalDataTypes.isEmpty) null else unionPersonalDataTypes + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/metrics/BUILD b/timelines/data_processing/ml_util/aggregation_framework/metrics/BUILD new file mode 100644 index 0000000000..676b31d813 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/metrics/BUILD @@ -0,0 +1,15 @@ +scala_library( + sources = ["*.scala"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "3rdparty/jvm/com/twitter/algebird:core", + "src/java/com/twitter/ml/api:api-base", + "src/java/com/twitter/ml/api/constant", + "src/scala/com/twitter/ml/api/util:datarecord", + "src/thrift/com/twitter/dal/personal_data:personal_data-java", + "src/thrift/com/twitter/ml/api:data-java", + "src/thrift/com/twitter/ml/api:interpretable-model-java", + "util/util-core:scala", + ], +) diff --git a/timelines/data_processing/ml_util/aggregation_framework/metrics/ConversionUtils.scala b/timelines/data_processing/ml_util/aggregation_framework/metrics/ConversionUtils.scala new file mode 100644 index 0000000000..b04263ea0e --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/metrics/ConversionUtils.scala @@ -0,0 +1,5 @@ +package 
com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics + +object ConversionUtils { + def booleanToDouble(value: Boolean): Double = if (value) 1.0 else 0.0 +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/metrics/CountMetric.scala b/timelines/data_processing/ml_util/aggregation_framework/metrics/CountMetric.scala new file mode 100644 index 0000000000..720fa68e59 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/metrics/CountMetric.scala @@ -0,0 +1,41 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics + +import com.twitter.ml.api._ +import com.twitter.ml.api.util.SRichDataRecord +import com.twitter.util.Time +import java.lang.{Long => JLong} + +case class TypedCountMetric[T]( +) extends TypedSumLikeMetric[T] { + import AggregationMetricCommon._ + import ConversionUtils._ + override val operatorName = "count" + + override def getIncrementValue( + record: DataRecord, + feature: Option[Feature[T]], + timestampFeature: Feature[JLong] + ): TimedValue[Double] = { + val featureExists: Boolean = feature match { + case Some(f) => SRichDataRecord(record).hasFeature(f) + case None => true + } + + TimedValue[Double]( + value = booleanToDouble(featureExists), + timestamp = Time.fromMilliseconds(getTimestamp(record, timestampFeature)) + ) + } +} + +/** + * Syntactic sugar for the count metric that works with + * any feature type as opposed to being tied to a specific type. + * See EasyMetric.scala for more details on why this is useful. 
+ */ +object CountMetric extends EasyMetric { + override def forFeatureType[T]( + featureType: FeatureType, + ): Option[AggregationMetric[T, _]] = + Some(TypedCountMetric[T]()) +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/metrics/EasyMetric.scala b/timelines/data_processing/ml_util/aggregation_framework/metrics/EasyMetric.scala new file mode 100644 index 0000000000..67edce7cef --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/metrics/EasyMetric.scala @@ -0,0 +1,34 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics + +import com.twitter.ml.api._ + +/** + * A "human-readable" metric that can be applied to features of multiple + * different types. Wrapper around AggregationMetric used as syntactic sugar + * for easier config. + */ +trait EasyMetric extends Serializable { + /* + * Given a feature type, fetches the correct underlying AggregationMetric + * to perform this operation over the given feature type, if any. If no such + * metric is available, returns None. For example, MEAN cannot be applied + * to FeatureType.String and would return None. + * + * @param featureType Type of feature to fetch metric for + * @param useFixedDecay Param to control whether the metric should use fixed decay + * logic (if appropriate) + * @return Strongly typed aggregation metric to use for this feature type + * + * For example, if the EasyMetric is MEAN and the featureType is + * FeatureType.Continuous, the underlying AggregationMetric should be a + * scalar mean. If the EasyMetric is MEAN and the featureType is + * FeatureType.SparseContinuous, the AggregationMetric returned could be a + * "vector" mean that averages sparse maps. Using the single logical name + * MEAN for both is nice syntactic sugar making for an easier to read top + * level config, though different underlying operators are used underneath + * for the actual implementation.
+ */ + def forFeatureType[T]( + featureType: FeatureType, + ): Option[AggregationMetric[T, _]] +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/metrics/FeatureCache.scala b/timelines/data_processing/ml_util/aggregation_framework/metrics/FeatureCache.scala new file mode 100644 index 0000000000..e5f384100e --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/metrics/FeatureCache.scala @@ -0,0 +1,72 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics + +import com.twitter.ml.api._ +import scala.collection.mutable + +trait FeatureCache[T] { + /* + * Constructs feature names from scratch given an aggregate query and an output + * feature name. E.g. given mean operator and "sum". This function is slow and should + * only be called at pre-computation time. + * + * @param query Details of aggregate feature + * @name Name of "output" feature for which we want to construct feature name + * @return Full name of output feature + */ + private def uncachedFullFeatureName(query: AggregateFeature[T], name: String): String = + List(query.featurePrefix, name).mkString(".") + + /* + * A cache from (aggregate query, output feature name) -> fully qualified feature name + * lazy since it doesn't need to be serialized to the mappers + */ + private lazy val featureNameCache = mutable.Map[(AggregateFeature[T], String), String]() + + /* + * A cache from (aggregate query, output feature name) -> precomputed output feature + * lazy since it doesn't need to be serialized to the mappers + */ + private lazy val featureCache = mutable.Map[(AggregateFeature[T], String), Feature[_]]() + + /** + * Given an (aggregate query, output feature name, output feature type), + * look it up using featureNameCache and featureCache, falling back to uncachedFullFeatureName() + * as a last resort to construct a precomputed output feature. Should only be + * called at pre-computation time. 
+ * + * @param query Details of aggregate feature + * @name Name of "output" feature we want to precompute + * @aggregateFeatureType type of "output" feature we want to precompute + */ + def cachedFullFeature( + query: AggregateFeature[T], + name: String, + aggregateFeatureType: FeatureType + ): Feature[_] = { + lazy val cachedFeatureName = featureNameCache.getOrElseUpdate( + (query, name), + uncachedFullFeatureName(query, name) + ) + + def uncachedFullFeature(): Feature[_] = { + val personalDataTypes = + AggregationMetricCommon.derivePersonalDataTypes(query.feature, query.label) + + aggregateFeatureType match { + case FeatureType.BINARY => new Feature.Binary(cachedFeatureName, personalDataTypes) + case FeatureType.DISCRETE => new Feature.Discrete(cachedFeatureName, personalDataTypes) + case FeatureType.STRING => new Feature.Text(cachedFeatureName, personalDataTypes) + case FeatureType.CONTINUOUS => new Feature.Continuous(cachedFeatureName, personalDataTypes) + case FeatureType.SPARSE_BINARY => + new Feature.SparseBinary(cachedFeatureName, personalDataTypes) + case FeatureType.SPARSE_CONTINUOUS => + new Feature.SparseContinuous(cachedFeatureName, personalDataTypes) + } + } + + featureCache.getOrElseUpdate( + (query, name), + uncachedFullFeature() + ) + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/metrics/LastResetMetric.scala b/timelines/data_processing/ml_util/aggregation_framework/metrics/LastResetMetric.scala new file mode 100644 index 0000000000..67fe444aaf --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/metrics/LastResetMetric.scala @@ -0,0 +1,107 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics + +import java.lang.{Long => JLong} +import com.twitter.ml.api._ +import com.twitter.ml.api.util.SRichDataRecord +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.ConversionUtils._ +import com.twitter.util.Duration +import 
com.twitter.util.Time +import scala.math.max + +/** + * This metric measures how recently an action has taken place. A value of 1.0 + * indicates the action happened just now. This value decays with time if the + * action has not taken place and is reset to 1 when the action happens. So lower + * value indicates a stale or older action. + * + * For example consider an action of "user liking a video". The last reset metric + * value changes as follows for a half life of 1 day. + * + * ---------------------------------------------------------------------------- + * day | action | feature value | Description + * ---------------------------------------------------------------------------- + * 1 | user likes the video | 1.0 | Set the value to 1 + * 2 | user does not like video | 0.5 | Decay the value + * 3 | user does not like video | 0.25 | Decay the value + * 4 | user likes the video | 1.0 | Reset the value to 1 + * ----------------------------------------------------------------------------- + * + * @tparam T + */ +case class TypedLastResetMetric[T]() extends TimedValueAggregationMetric[T] { + import AggregationMetricCommon._ + + override val operatorName = "last_reset" + + override def getIncrementValue( + record: DataRecord, + feature: Option[Feature[T]], + timestampFeature: Feature[JLong] + ): TimedValue[Double] = { + val featureExists: Boolean = feature match { + case Some(f) => SRichDataRecord(record).hasFeature(f) + case None => true + } + + TimedValue[Double]( + value = booleanToDouble(featureExists), + timestamp = Time.fromMilliseconds(getTimestamp(record, timestampFeature)) + ) + } + private def getDecayedValue( + olderTimedValue: TimedValue[Double], + newerTimestamp: Time, + halfLife: Duration + ): Double = { + if (halfLife.inMilliseconds == 0L) { + 0.0 + } else { + val timeDelta = newerTimestamp.inMilliseconds - olderTimedValue.timestamp.inMilliseconds + val resultValue = olderTimedValue.value / math.pow(2.0, timeDelta / halfLife.inMillis) + if 
(resultValue > AggregationMetricCommon.Epsilon) resultValue else 0.0 + } + } + + override def plus( + left: TimedValue[Double], + right: TimedValue[Double], + halfLife: Duration + ): TimedValue[Double] = { + + val (newerTimedValue, olderTimedValue) = if (left.timestamp > right.timestamp) { + (left, right) + } else { + (right, left) + } + + val optionallyDecayedOlderValue = if (halfLife == Duration.Top) { + // Since we don't want to decay, older value is not changed + olderTimedValue.value + } else { + // Decay older value + getDecayedValue(olderTimedValue, newerTimedValue.timestamp, halfLife) + } + + TimedValue[Double]( + value = max(newerTimedValue.value, optionallyDecayedOlderValue), + timestamp = newerTimedValue.timestamp + ) + } + + override def zero(timeOpt: Option[Long]): TimedValue[Double] = TimedValue[Double]( + value = 0.0, + timestamp = Time.fromMilliseconds(0) + ) +} + +/** + * Syntactic sugar for the last reset metric that works with + * any feature type as opposed to being tied to a specific type. + * See EasyMetric.scala for more details on why this is useful. 
+ */ +object LastResetMetric extends EasyMetric { + override def forFeatureType[T]( + featureType: FeatureType + ): Option[AggregationMetric[T, _]] = + Some(TypedLastResetMetric[T]()) +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/metrics/LatestMetric.scala b/timelines/data_processing/ml_util/aggregation_framework/metrics/LatestMetric.scala new file mode 100644 index 0000000000..08bd6483ae --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/metrics/LatestMetric.scala @@ -0,0 +1,69 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics + +import com.twitter.ml.api._ +import com.twitter.ml.api.util.SRichDataRecord +import com.twitter.ml.api.DataRecord +import com.twitter.ml.api.Feature +import com.twitter.ml.api.FeatureType +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon.getTimestamp +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetric +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.EasyMetric +import com.twitter.util.Duration +import com.twitter.util.Time +import java.lang.{Double => JDouble} +import java.lang.{Long => JLong} +import java.lang.{Number => JNumber} + +case class TypedLatestMetric[T <: JNumber](defaultValue: Double = 0.0) + extends TimedValueAggregationMetric[T] { + override val operatorName = "latest" + + override def plus( + left: TimedValue[Double], + right: TimedValue[Double], + halfLife: Duration + ): TimedValue[Double] = { + assert( + halfLife.toString == "Duration.Top", + s"halfLife must be Duration.Top when using latest metric, but ${halfLife.toString} is used" + ) + + if (left.timestamp > right.timestamp) { + left + } else { + right + } + } + + override def getIncrementValue( + dataRecord: DataRecord, + feature: Option[Feature[T]], + timestampFeature: Feature[JLong] + ): TimedValue[Double] = { + val value = feature + 
.flatMap(SRichDataRecord(dataRecord).getFeatureValueOpt(_)) + .map(_.doubleValue()).getOrElse(defaultValue) + val timestamp = Time.fromMilliseconds(getTimestamp(dataRecord, timestampFeature)) + TimedValue[Double](value = value, timestamp = timestamp) + } + + override def zero(timeOpt: Option[Long]): TimedValue[Double] = + TimedValue[Double]( + value = 0.0, + timestamp = Time.fromMilliseconds(0) + ) +} + +object LatestMetric extends EasyMetric { + override def forFeatureType[T]( + featureType: FeatureType + ): Option[AggregationMetric[T, _]] = { + featureType match { + case FeatureType.CONTINUOUS => + Some(TypedLatestMetric[JDouble]().asInstanceOf[AggregationMetric[T, Double]]) + case FeatureType.DISCRETE => + Some(TypedLatestMetric[JLong]().asInstanceOf[AggregationMetric[T, Double]]) + case _ => None + } + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/metrics/MaxMetric.scala b/timelines/data_processing/ml_util/aggregation_framework/metrics/MaxMetric.scala new file mode 100644 index 0000000000..b9e9176bb9 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/metrics/MaxMetric.scala @@ -0,0 +1,64 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics + +import com.twitter.ml.api._ +import com.twitter.ml.api.util.SRichDataRecord +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon.getTimestamp +import com.twitter.util.Duration +import com.twitter.util.Time +import java.lang.{Long => JLong} +import java.lang.{Number => JNumber} +import java.lang.{Double => JDouble} +import scala.math.max + +case class TypedMaxMetric[T <: JNumber](defaultValue: Double = 0.0) + extends TimedValueAggregationMetric[T] { + override val operatorName = "max" + + override def getIncrementValue( + dataRecord: DataRecord, + feature: Option[Feature[T]], + timestampFeature: Feature[JLong] + ): TimedValue[Double] = { + val value = feature + 
.flatMap(SRichDataRecord(dataRecord).getFeatureValueOpt(_)) + .map(_.doubleValue()).getOrElse(defaultValue) + val timestamp = Time.fromMilliseconds(getTimestamp(dataRecord, timestampFeature)) + TimedValue[Double](value = value, timestamp = timestamp) + } + + override def plus( + left: TimedValue[Double], + right: TimedValue[Double], + halfLife: Duration + ): TimedValue[Double] = { + + assert( + halfLife.toString == "Duration.Top", + s"halfLife must be Duration.Top when using max metric, but ${halfLife.toString} is used" + ) + + TimedValue[Double]( + value = max(left.value, right.value), + timestamp = left.timestamp.max(right.timestamp) + ) + } + + override def zero(timeOpt: Option[Long]): TimedValue[Double] = + TimedValue[Double]( + value = 0.0, + timestamp = Time.fromMilliseconds(0) + ) +} + +object MaxMetric extends EasyMetric { + def forFeatureType[T]( + featureType: FeatureType, + ): Option[AggregationMetric[T, _]] = + featureType match { + case FeatureType.CONTINUOUS => + Some(TypedMaxMetric[JDouble]().asInstanceOf[AggregationMetric[T, Double]]) + case FeatureType.DISCRETE => + Some(TypedMaxMetric[JLong]().asInstanceOf[AggregationMetric[T, Double]]) + case _ => None + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/metrics/SumLikeMetric.scala b/timelines/data_processing/ml_util/aggregation_framework/metrics/SumLikeMetric.scala new file mode 100644 index 0000000000..1f7aeb58ad --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/metrics/SumLikeMetric.scala @@ -0,0 +1,66 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics + +import com.twitter.ml.api._ +import com.twitter.util.Duration +import com.twitter.util.Time +import java.lang.{Double => JDouble} +import java.lang.{Long => JLong} +import java.util.{Map => JMap} + +/* + * TypedSumLikeMetric aggregates a sum over any feature transform. 
+ * TypedCountMetric, TypedSumMetric, TypedSumSqMetric are examples + * of metrics that inherit from this trait. To implement a new + * "sum like" metric, override the getIncrementValue() and operatorName + * members of this trait. + * + * getIncrementValue() is inherited from the + * parent trait AggregationMetric, but not overridden in this trait, so + * it needs to be overridden by any metric that extends TypedSumLikeMetric. + * + * operatorName is a string used for naming the resultant aggregate feature + * (e.g. "count" if it's a count feature, or "sum" if a sum feature). + */ +trait TypedSumLikeMetric[T] extends TimedValueAggregationMetric[T] { + import AggregationMetricCommon._ + + def useFixedDecay = true + + override def plus( + left: TimedValue[Double], + right: TimedValue[Double], + halfLife: Duration + ): TimedValue[Double] = { + val resultValue = if (halfLife == Duration.Top) { + /* We could use decayedValueMonoid here, but + * a simple addition is slightly more accurate */ + left.value + right.value + } else { + val decayedLeft = toDecayedValue(left, halfLife) + val decayedRight = toDecayedValue(right, halfLife) + decayedValueMonoid.plus(decayedLeft, decayedRight).value + } + + TimedValue[Double]( + resultValue, + left.timestamp.max(right.timestamp) + ) + } + + override def zero(timeOpt: Option[Long]): TimedValue[Double] = { + val timestamp = + /* + * Please see TQ-11279 for documentation for this fix to the decay logic.
+ */ + if (useFixedDecay) { + Time.fromMilliseconds(timeOpt.getOrElse(0L)) + } else { + Time.fromMilliseconds(0L) + } + + TimedValue[Double]( + value = 0.0, + timestamp = timestamp + ) + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/metrics/SumMetric.scala b/timelines/data_processing/ml_util/aggregation_framework/metrics/SumMetric.scala new file mode 100644 index 0000000000..bd93d5bae3 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/metrics/SumMetric.scala @@ -0,0 +1,52 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics + +import com.twitter.ml.api._ +import com.twitter.ml.api.util.SRichDataRecord +import com.twitter.util.Time +import java.lang.{Double => JDouble} +import java.lang.{Long => JLong} + +case class TypedSumMetric( +) extends TypedSumLikeMetric[JDouble] { + import AggregationMetricCommon._ + + override val operatorName = "sum" + + /* + * Transform feature -> its value in the given record, + * or 0 when feature = None (sum has no meaning in this case) + */ + override def getIncrementValue( + record: DataRecord, + feature: Option[Feature[JDouble]], + timestampFeature: Feature[JLong] + ): TimedValue[Double] = feature match { + case Some(f) => { + TimedValue[Double]( + value = Option(SRichDataRecord(record).getFeatureValue(f)).map(_.toDouble).getOrElse(0.0), + timestamp = Time.fromMilliseconds(getTimestamp(record, timestampFeature)) + ) + } + + case None => + TimedValue[Double]( + value = 0.0, + timestamp = Time.fromMilliseconds(getTimestamp(record, timestampFeature)) + ) + } +} + +/** + * Syntactic sugar for the sum metric that works with continuous features. + * See EasyMetric.scala for more details on why this is useful. 
+ */ +object SumMetric extends EasyMetric { + override def forFeatureType[T]( + featureType: FeatureType + ): Option[AggregationMetric[T, _]] = + featureType match { + case FeatureType.CONTINUOUS => + Some(TypedSumMetric().asInstanceOf[AggregationMetric[T, Double]]) + case _ => None + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/metrics/SumSqMetric.scala b/timelines/data_processing/ml_util/aggregation_framework/metrics/SumSqMetric.scala new file mode 100644 index 0000000000..b24b163778 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/metrics/SumSqMetric.scala @@ -0,0 +1,53 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics + +import com.twitter.ml.api._ +import com.twitter.ml.api.util.SRichDataRecord +import com.twitter.util.Time +import java.lang.{Double => JDouble} +import java.lang.{Long => JLong} + +case class TypedSumSqMetric() extends TypedSumLikeMetric[JDouble] { + import AggregationMetricCommon._ + + override val operatorName = "sumsq" + + /* + * Transform feature -> its squared value in the given record + * or 0 when feature = None (sumsq has no meaning in this case) + */ + override def getIncrementValue( + record: DataRecord, + feature: Option[Feature[JDouble]], + timestampFeature: Feature[JLong] + ): TimedValue[Double] = feature match { + case Some(f) => { + val featureVal = + Option(SRichDataRecord(record).getFeatureValue(f)).map(_.toDouble).getOrElse(0.0) + TimedValue[Double]( + value = featureVal * featureVal, + timestamp = Time.fromMilliseconds(getTimestamp(record, timestampFeature)) + ) + } + + case None => + TimedValue[Double]( + value = 0.0, + timestamp = Time.fromMilliseconds(getTimestamp(record, timestampFeature)) + ) + } +} + +/** + * Syntactic sugar for the sum of squares metric that works with continuous features. + * See EasyMetric.scala for more details on why this is useful. 
+ */ +object SumSqMetric extends EasyMetric { + override def forFeatureType[T]( + featureType: FeatureType + ): Option[AggregationMetric[T, _]] = + featureType match { + case FeatureType.CONTINUOUS => + Some(TypedSumSqMetric().asInstanceOf[AggregationMetric[T, Double]]) + case _ => None + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/metrics/TimedValue.scala b/timelines/data_processing/ml_util/aggregation_framework/metrics/TimedValue.scala new file mode 100644 index 0000000000..7f9fb50902 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/metrics/TimedValue.scala @@ -0,0 +1,14 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics + +import com.twitter.util.Time + +/** + * Case class wrapping a (value, timestamp) tuple. + * All aggregate metrics must operate over this class + * to ensure we can implement decay and half lives for them. + * This is translated to an algebird DecayedValue under the hood. + * + * @param value Value being wrapped + * @param timestamp Time after epoch at which value is being measured + */ +case class TimedValue[T](value: T, timestamp: Time) diff --git a/timelines/data_processing/ml_util/aggregation_framework/metrics/TimedValueAggregationMetric.scala b/timelines/data_processing/ml_util/aggregation_framework/metrics/TimedValueAggregationMetric.scala new file mode 100644 index 0000000000..f31152a234 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/metrics/TimedValueAggregationMetric.scala @@ -0,0 +1,90 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics + +import com.twitter.ml.api._ +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregateFeature +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.TimedValue +import 
com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetric +import com.twitter.util.Duration +import com.twitter.util.Time +import java.lang.{Double => JDouble} +import java.lang.{Long => JLong} +import java.util.{Map => JMap} + +/* + * TimedValueAggregationMetric overrides the methods of AggregationMetric dealing + * with reading and writing continuous values from a data record. + * + * operatorName is a string used for naming the resultant aggregate feature + * (e.g. "count" if it's a count feature, or "sum" if a sum feature). + */ +trait TimedValueAggregationMetric[T] extends AggregationMetric[T, Double] { + import AggregationMetricCommon._ + + val operatorName: String + + override def getAggregateValue( + record: DataRecord, + query: AggregateFeature[T], + aggregateOutputs: Option[List[JLong]] = None + ): TimedValue[Double] = { + /* + * We know aggregateOutputs(0) will have the continuous feature, + * since we put it there in getOutputFeatureIds() - see code below. + * This helps us get a 4x speedup. Using any structure more complex + * than a list was also a performance bottleneck.
+ */ + val featureHash: JLong = aggregateOutputs + .getOrElse(getOutputFeatureIds(query)) + .head + + val continuousValueOption: Option[Double] = Option(record.continuousFeatures) + .flatMap { case jmap: JMap[JLong, JDouble] => Option(jmap.get(featureHash)) } + .map(_.toDouble) + + val timeOption = Option(record.discreteFeatures) + .flatMap { case jmap: JMap[JLong, JLong] => Option(jmap.get(TimestampHash)) } + .map(_.toLong) + + val resultOption: Option[TimedValue[Double]] = (continuousValueOption, timeOption) match { + case (Some(featureValue), Some(timesamp)) => + Some(TimedValue[Double](featureValue, Time.fromMilliseconds(timesamp))) + case _ => None + } + + resultOption.getOrElse(zero(timeOption)) + } + + override def setAggregateValue( + record: DataRecord, + query: AggregateFeature[T], + aggregateOutputs: Option[List[JLong]] = None, + value: TimedValue[Double] + ): Unit = { + /* + * We know aggregateOutputs(0) will have the continuous feature, + * since we put it there in getOutputFeatureIds() - see code below. + * This helps us get a 4x speedup. Using any structure more complex + * than a list was also a performance bottleneck. + */ + val featureHash: JLong = aggregateOutputs + .getOrElse(getOutputFeatureIds(query)) + .head + + /* Only set value if non-zero to save space */ + if (value.value != 0.0) { + record.putToContinuousFeatures(featureHash, value.value) + } + + /* + * We do not set timestamp since that might affect correctness of + * future aggregations due to the decay semantics. 
+ */ + } + + /* Only one feature stored in the aggregated datarecord: the result continuous value */ + override def getOutputFeatures(query: AggregateFeature[T]): List[Feature[_]] = { + val feature = cachedFullFeature(query, operatorName, FeatureType.CONTINUOUS) + List(feature) + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/package.scala b/timelines/data_processing/ml_util/aggregation_framework/package.scala new file mode 100644 index 0000000000..824398a7f3 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/package.scala @@ -0,0 +1,19 @@ +package com.twitter.timelines.data_processing.ml_util + +import com.twitter.ml.api.DataRecord + +package object aggregation_framework { + object AggregateType extends Enumeration { + type AggregateType = Value + val User, UserAuthor, UserEngager, UserMention, UserRequestHour, UserRequestDow, + UserOriginalAuthor, UserList, UserTopic, UserInferredTopic, UserMediaUnderstandingAnnotation = + Value + } + + type AggregateUserEntityKey = (Long, AggregateType.Value, Option[Long]) + + case class MergedRecordsDescriptor( + userId: Long, + keyedRecords: Map[AggregateType.Value, Option[KeyedRecord]], + keyedRecordMaps: Map[AggregateType.Value, Option[KeyedRecordMap]]) +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/query/BUILD b/timelines/data_processing/ml_util/aggregation_framework/query/BUILD new file mode 100644 index 0000000000..97e6d1ea77 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/query/BUILD @@ -0,0 +1,12 @@ +scala_library( + sources = ["*.scala"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "finagle/finagle-stats", + "src/java/com/twitter/ml/api:api-base", + "src/thrift/com/twitter/ml/api:data-scala", + "src/thrift/com/twitter/ml/api:interpretable-model-java", + "timelines/data_processing/ml_util/aggregation_framework/metrics", + ], +) diff --git 
a/timelines/data_processing/ml_util/aggregation_framework/query/ScopedAggregateBuilder.scala b/timelines/data_processing/ml_util/aggregation_framework/query/ScopedAggregateBuilder.scala new file mode 100644 index 0000000000..2fcce3312c --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/query/ScopedAggregateBuilder.scala @@ -0,0 +1,159 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.query + +import com.twitter.dal.personal_data.thriftjava.PersonalDataType +import com.twitter.ml.api.DataRecord +import com.twitter.ml.api.Feature +import com.twitter.ml.api.FeatureBuilder +import com.twitter.ml.api.FeatureContext +import com.twitter.ml.api.thriftscala.{DataRecord => ScalaDataRecord} +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon +import java.lang.{Double => JDouble} +import java.lang.{Long => JLong} +import scala.collection.JavaConverters._ + +/** + * Provides methods to build "scoped" aggregates, where base features generated by aggregates + * V2 are scoped with a specific key. + * + * The class provides methods that take a Map of T -> DataRecord, where T is a key type, and + * the DataRecord contains features produced by the aggregation_framework. The methods then + * generate a _new_ DataRecord, containing "scoped" aggregate features, where each scoped + * feature has the value of the scope key in the feature name, and the value of the feature + * is the value of the original aggregate feature in the corresponding value from the original + * Map. + * + * For efficiency reasons, the builder is initialized with the set of features that should be + * scoped and the set of keys for which scoping should be supported. 
+ * + * To understand how scope feature names are constructed, consider the following: + * + * {{{ + * val features = Set( + * new Feature.Continuous("user_injection_aggregate.pair.any_label.any_feature.5.days.count"), + * new Feature.Continuous("user_injection_aggregate.pair.any_label.any_feature.10.days.count") + * ) + * val scopes = Set(SuggestType.Recap, SuggestType.WhoToFollow) + * val scopeName = "InjectionType" + * val scopedAggregateBuilder = ScopedAggregateBuilder(features, scopes, scopeName) + * + * }}} + * + * Then, generated scoped features would be among the following: + * - user_injection_aggregate.scoped.pair.any_label.any_feature.5.days.count/scope_name=InjectionType/scope=Recap + * - user_injection_aggregate.scoped.pair.any_label.any_feature.5.days.count/scope_name=InjectionType/scope=WhoToFollow + * - user_injection_aggregate.scoped.pair.any_label.any_feature.10.days.count/scope_name=InjectionType/scope=Recap + * - user_injection_aggregate.scoped.pair.any_label.any_feature.10.days.count/scope_name=InjectionType/scope=WhoToFollow + * + * @param featuresToScope the set of features for which one should generate scoped versions + * @param scopeKeys the set of scope keys to generate scopes with + * @param scopeName a string indicating what the scopes represent. 
This is also added to the scoped feature + * @tparam K the type of scope key + */ +class ScopedAggregateBuilder[K]( + featuresToScope: Set[Feature[JDouble]], + scopeKeys: Set[K], + scopeName: String) { + + private[this] def buildScopedAggregateFeature( + baseName: String, + scopeValue: String, + personalDataTypes: java.util.Set[PersonalDataType] + ): Feature[JDouble] = { + val components = baseName.split("\\.").toList + + val newName = (components.head :: "scoped" :: components.tail).mkString(".") + + new FeatureBuilder.Continuous() + .addExtensionDimensions("scope_name", "scope") + .setBaseName(newName) + .setPersonalDataTypes(personalDataTypes) + .extensionBuilder() + .addExtension("scope_name", scopeName) + .addExtension("scope", scopeValue) + .build() + } + + /** + * Index of (base aggregate feature name, key) -> key scoped count feature. + */ + private[this] val keyScopedAggregateMap: Map[(String, K), Feature[JDouble]] = { + featuresToScope.flatMap { feat => + scopeKeys.map { key => + (feat.getFeatureName, key) -> + buildScopedAggregateFeature( + feat.getFeatureName, + key.toString, + AggregationMetricCommon.derivePersonalDataTypes(Some(feat)) + ) + } + }.toMap + } + + type ContinuousFeaturesMap = Map[JLong, JDouble] + + /** + * Create key-scoped features for raw aggregate feature ID to value maps, partitioned by key. + */ + private[this] def buildAggregates(featureMapsByKey: Map[K, ContinuousFeaturesMap]): DataRecord = { + val continuousFeatures = featureMapsByKey + .flatMap { + case (key, featureMap) => + featuresToScope.flatMap { feature => + val newFeatureOpt = keyScopedAggregateMap.get((feature.getFeatureName, key)) + newFeatureOpt.flatMap { newFeature => + featureMap.get(feature.getFeatureId).map(new JLong(newFeature.getFeatureId) -> _) + } + }.toMap + } + + new DataRecord().setContinuousFeatures(continuousFeatures.asJava) + } + + /** + * Create key-scoped features for Java [[DataRecord]] aggregate records partitioned by key. 
+ * + * As an example, if the provided Map includes the key `SuggestType.Recap`, and [[scopeKeys]] + * includes this key, then for a feature "xyz.pair.any_label.any_feature.5.days.count", the method + * will generate the scoped feature "xyz.scoped.pair.any_label.any_feature.5.days.count/scope_name=InjectionType/scope=Recap", + * with the value being the value of the original feature from the Map. + * + * @param aggregatesByKey a map from scope key to the Java [[DataRecord]] of aggregates for that key; only its continuous features (feature ID -> Double) are read, and a record whose continuous feature map is unset (null) contributes no scoped features + * @return a Java [[DataRecord]] containing key-scoped features + */ + def buildAggregatesJava(aggregatesByKey: Map[K, DataRecord]): DataRecord = { + val featureMapsByKey = aggregatesByKey.mapValues(record => Option(record.continuousFeatures).map(_.asScala.toMap).getOrElse(Map.empty[JLong, JDouble])) + buildAggregates(featureMapsByKey) + } + + /** + * Create key-scoped features for Scala [[DataRecord]] aggregate records partitioned by key. + * + * As an example, if the provided Map includes the key `SuggestType.Recap`, and [[scopeKeys]] + * includes this key, then for a feature "xyz.pair.any_label.any_feature.5.days.count", the method + * will generate the scoped feature "xyz.scoped.pair.any_label.any_feature.5.days.count/scope_name=InjectionType/scope=Recap", + * with the value being the value of the original feature from the Map. + * + * This is a convenience method for some use cases where aggregates are read from Scala + * thrift objects. Note that this still returns a Java [[DataRecord]], since most ML API + * use the Java version. + * + * @param aggregatesByKey a map from key to a continuous feature map (ie. 
feature ID -> Double) + * @return a Java [[DataRecord]] containing key-scoped features + */ + def buildAggregatesScala(aggregatesByKey: Map[K, ScalaDataRecord]): DataRecord = { + val featureMapsByKey = + aggregatesByKey + .mapValues { record => + val featureMap = record.continuousFeatures.getOrElse(Map[Long, Double]()).toMap + featureMap.map { case (k, v) => new JLong(k) -> new JDouble(v) } + } + buildAggregates(featureMapsByKey) + } + + /** + * Returns a [[FeatureContext]] including all possible scoped features generated using this builder. + * + * @return a [[FeatureContext]] containing all scoped features. + */ + def scopedFeatureContext: FeatureContext = new FeatureContext(keyScopedAggregateMap.values.asJava) +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/scalding/AggregateFeaturesMerger.scala b/timelines/data_processing/ml_util/aggregation_framework/scalding/AggregateFeaturesMerger.scala new file mode 100644 index 0000000000..156168a9d2 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/scalding/AggregateFeaturesMerger.scala @@ -0,0 +1,213 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding + +import com.twitter.ml.api._ +import com.twitter.ml.api.constant.SharedFeatures._ +import com.twitter.ml.api.util.SRichDataRecord +import com.twitter.scalding.Stat +import com.twitter.scalding.typed.TypedPipe +import com.twitter.timelines.data_processing.ml_util.aggregation_framework._ +import com.twitter.timelines.data_processing.ml_util.sampling.SamplingUtils + +trait AggregateFeaturesMergerBase { + import Utils._ + + def samplingRateOpt: Option[Double] + def numReducers: Int = 2000 + def numReducersMerge: Int = 20000 + + def aggregationConfig: AggregationConfig + def storeRegister: StoreRegister + def storeMerger: StoreMerger + + def getAggregatePipe(storeName: String): DataSetPipe + def applyMaxSizeByTypeOpt(aggregateType: AggregateType.Value): Option[Int] = Option.empty[Int] + + 
def usersActiveSourcePipe: TypedPipe[Long] + def numRecords: Stat + def numFilteredRecords: Stat + + /* + * This method should only be called with a storeName that corresponds + * to a user aggregate store. + */ + def extractUserFeaturesMap(storeName: String): TypedPipe[(Long, KeyedRecord)] = { + val aggregateKey = storeRegister.storeNameToTypeMap(storeName) + samplingRateOpt + .map(rate => SamplingUtils.userBasedSample(getAggregatePipe(storeName), rate)) + .getOrElse(getAggregatePipe(storeName)) // must return store with only user aggregates + .records + .map { r: DataRecord => + val record = SRichDataRecord(r) + val userId = record.getFeatureValue(USER_ID).longValue + record.clearFeature(USER_ID) + (userId, KeyedRecord(aggregateKey, r)) + } + } + + /* + * When the secondaryKey being used is a String, the shouldHash parameter should be set to true. + * TODO: refactor such that the shouldHash parameter is removed and the behavior + * is defaulted to true. + * + * This method should only be called with a storeName that contains records with the + * desired secondaryKey. We provide secondaryKeyFilterPipeOpt against which secondary + * keys can be filtered to help prune the final merged MH dataset. 
+ */ + def extractSecondaryTuples[T]( + storeName: String, + secondaryKey: Feature[T], + shouldHash: Boolean = false, + maxSizeOpt: Option[Int] = None, + secondaryKeyFilterPipeOpt: Option[TypedPipe[Long]] = None + ): TypedPipe[(Long, KeyedRecordMap)] = { + val aggregateKey = storeRegister.storeNameToTypeMap(storeName) + + val extractedRecordsBySecondaryKey = + samplingRateOpt + .map(rate => SamplingUtils.userBasedSample(getAggregatePipe(storeName), rate)) + .getOrElse(getAggregatePipe(storeName)) + .records + .map { r: DataRecord => + val record = SRichDataRecord(r) + val userId = keyFromLong(r, USER_ID) + val secondaryId = extractSecondary(r, secondaryKey, shouldHash) + record.clearFeature(USER_ID) + record.clearFeature(secondaryKey) + + numRecords.inc() + (userId, secondaryId -> r) + } + + val grouped = + (secondaryKeyFilterPipeOpt match { + case Some(secondaryKeyFilterPipe: TypedPipe[Long]) => + extractedRecordsBySecondaryKey + .map { + // In this step, we swap `userId` with `secondaryId` to join on the `secondaryId` + // It is important to swap them back after the join, otherwise the job will fail. 
+ case (userId, (secondaryId, r)) => + (secondaryId, (userId, r)) + } + .join(secondaryKeyFilterPipe.groupBy(identity)) + .map { + case (secondaryId, ((userId, r), _)) => + numFilteredRecords.inc() + (userId, secondaryId -> r) + } + case _ => extractedRecordsBySecondaryKey + }).group + .withReducers(numReducers) + + maxSizeOpt match { + case Some(maxSize) => + grouped + .take(maxSize) + .mapValueStream(recordsIter => Iterator(KeyedRecordMap(aggregateKey, recordsIter.toMap))) + .toTypedPipe + case None => + grouped + .mapValueStream(recordsIter => Iterator(KeyedRecordMap(aggregateKey, recordsIter.toMap))) + .toTypedPipe + } + } + + def userPipes: Seq[TypedPipe[(Long, KeyedRecord)]] = + storeRegister.allStores.flatMap { storeConfig => + val StoreConfig(storeNames, aggregateType, _) = storeConfig + require(storeMerger.isValidToMerge(storeNames)) + + if (aggregateType == AggregateType.User) { + storeNames.map(extractUserFeaturesMap) + } else None + }.toSeq + + private def getSecondaryKeyFilterPipeOpt( + aggregateType: AggregateType.Value + ): Option[TypedPipe[Long]] = { + if (aggregateType == AggregateType.UserAuthor) { + Some(usersActiveSourcePipe) + } else None + } + + def userSecondaryKeyPipes: Seq[TypedPipe[(Long, KeyedRecordMap)]] = { + storeRegister.allStores.flatMap { storeConfig => + val StoreConfig(storeNames, aggregateType, shouldHash) = storeConfig + require(storeMerger.isValidToMerge(storeNames)) + + if (aggregateType != AggregateType.User) { + storeNames.flatMap { storeName => + storeConfig.secondaryKeyFeatureOpt + .map { secondaryFeature => + extractSecondaryTuples( + storeName, + secondaryFeature, + shouldHash, + applyMaxSizeByTypeOpt(aggregateType), + getSecondaryKeyFilterPipeOpt(aggregateType) + ) + } + } + } else None + }.toSeq + } + + def joinedAggregates: TypedPipe[(Long, MergedRecordsDescriptor)] = { + (userPipes ++ userSecondaryKeyPipes) + .reduce(_ ++ _) + .group + .withReducers(numReducersMerge) + .mapGroup { + case (uid, keyedRecordsAndMaps) => 
+ /* + * For every user, partition their records by aggregate type. + * AggregateType.User should only contain KeyedRecord whereas + * other aggregate types (with secondary keys) contain KeyedRecordMap. + */ + val (userRecords, userSecondaryKeyRecords) = keyedRecordsAndMaps.toList + .map { record => + record match { + case record: KeyedRecord => (record.aggregateType, record) + case record: KeyedRecordMap => (record.aggregateType, record) + } + } + .groupBy(_._1) + .mapValues(_.map(_._2)) + .partition(_._1 == AggregateType.User) + + val userAggregateRecordMap: Map[AggregateType.Value, Option[KeyedRecord]] = + userRecords + .asInstanceOf[Map[AggregateType.Value, List[KeyedRecord]]] + .map { + case (aggregateType, keyedRecords) => + val mergedKeyedRecordOpt = mergeKeyedRecordOpts(keyedRecords.map(Some(_)): _*) + (aggregateType, mergedKeyedRecordOpt) + } + + val userSecondaryKeyAggregateRecordOpt: Map[AggregateType.Value, Option[KeyedRecordMap]] = + userSecondaryKeyRecords + .asInstanceOf[Map[AggregateType.Value, List[KeyedRecordMap]]] + .map { + case (aggregateType, keyedRecordMaps) => + val keyedRecordMapOpt = + keyedRecordMaps.foldLeft(Option.empty[KeyedRecordMap]) { + (mergedRecOpt, nextRec) => + applyMaxSizeByTypeOpt(aggregateType) + .map { maxSize => + mergeKeyedRecordMapOpts(mergedRecOpt, Some(nextRec), maxSize) + }.getOrElse { + mergeKeyedRecordMapOpts(mergedRecOpt, Some(nextRec)) + } + } + (aggregateType, keyedRecordMapOpt) + } + + Iterator( + MergedRecordsDescriptor( + userId = uid, + keyedRecords = userAggregateRecordMap, + keyedRecordMaps = userSecondaryKeyAggregateRecordOpt + ) + ) + }.toTypedPipe + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/scalding/AggregatesStoreComparisonJob.scala b/timelines/data_processing/ml_util/aggregation_framework/scalding/AggregatesStoreComparisonJob.scala new file mode 100644 index 0000000000..054d5d4283 --- /dev/null +++ 
b/timelines/data_processing/ml_util/aggregation_framework/scalding/AggregatesStoreComparisonJob.scala @@ -0,0 +1,200 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding + +import com.twitter.algebird.ScMapMonoid +import com.twitter.bijection.Injection +import com.twitter.bijection.thrift.CompactThriftCodec +import com.twitter.ml.api.util.CompactDataRecordConverter +import com.twitter.ml.api.CompactDataRecord +import com.twitter.ml.api.DataRecord +import com.twitter.scalding.commons.source.VersionedKeyValSource +import com.twitter.scalding.Args +import com.twitter.scalding.Days +import com.twitter.scalding.Duration +import com.twitter.scalding.RichDate +import com.twitter.scalding.TypedPipe +import com.twitter.scalding.TypedTsv +import com.twitter.scalding_internal.job.HasDateRange +import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchJob +import com.twitter.summingbird.batch.BatchID +import com.twitter.summingbird_internal.bijection.BatchPairImplicits +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKeyInjection +import java.lang.{Double => JDouble} +import java.lang.{Long => JLong} +import scala.collection.JavaConverters._ + +/** + * The job takes four inputs: + * - The path to an AggregateStore using the DataRecord format. + * - The path to an AggregateStore using the CompactDataRecord format. + * - A version that must be present in both sources. + * - A sink to write the comparison statistics. + * + * The job reads in the two stores, converts the second one to DataRecords and + * then compares each key to see if the two stores have identical DataRecords, + * modulo the loss in precision on converting the Double to Float. 
+ */ +class AggregatesStoreComparisonJob(args: Args) + extends AnalyticsBatchJob(args) + with BatchPairImplicits + with HasDateRange { + + import AggregatesStoreComparisonJob._ + override def batchIncrement: Duration = Days(1) + override def firstTime: RichDate = RichDate(args("firstTime")) + + private val dataRecordSourcePath = args("dataRecordSource") + private val compactDataRecordSourcePath = args("compactDataRecordSource") + + private val version = args.long("version") + + private val statsSink = args("sink") + + require(dataRecordSourcePath != compactDataRecordSourcePath) + + private val dataRecordSource = + VersionedKeyValSource[AggregationKey, (BatchID, DataRecord)]( + path = dataRecordSourcePath, + sourceVersion = Some(version) + ) + private val compactDataRecordSource = + VersionedKeyValSource[AggregationKey, (BatchID, CompactDataRecord)]( + path = compactDataRecordSourcePath, + sourceVersion = Some(version) + ) + + private val dataRecordPipe: TypedPipe[((AggregationKey, BatchID), DataRecord)] = TypedPipe + .from(dataRecordSource) + .map { case (key, (batchId, record)) => ((key, batchId), record) } + + private val compactDataRecordPipe: TypedPipe[((AggregationKey, BatchID), DataRecord)] = TypedPipe + .from(compactDataRecordSource) + .map { + case (key, (batchId, compactRecord)) => + val record = compactConverter.compactDataRecordToDataRecord(compactRecord) + ((key, batchId), record) + } + + dataRecordPipe + .outerJoin(compactDataRecordPipe) + .mapValues { case (leftOpt, rightOpt) => compareDataRecords(leftOpt, rightOpt) } + .values + .sum(mapMonoid) + .flatMap(_.toList) + .write(TypedTsv(statsSink)) +} + +object AggregatesStoreComparisonJob { + + val mapMonoid: ScMapMonoid[String, Long] = new ScMapMonoid[String, Long]() + + implicit private val aggregationKeyInjection: Injection[AggregationKey, Array[Byte]] = + AggregationKeyInjection + implicit private val aggregationKeyOrdering: Ordering[AggregationKey] = AggregationKeyOrdering + implicit private val 
dataRecordCodec: Injection[DataRecord, Array[Byte]] = + CompactThriftCodec[DataRecord] + implicit private val compactDataRecordCodec: Injection[CompactDataRecord, Array[Byte]] = + CompactThriftCodec[CompactDataRecord] + + private val compactConverter = new CompactDataRecordConverter + + val missingRecordFromLeft = "missingRecordFromLeft" + val missingRecordFromRight = "missingRecordFromRight" + val nonContinuousFeaturesDidNotMatch = "nonContinuousFeaturesDidNotMatch" + val missingFeaturesFromLeft = "missingFeaturesFromLeft" + val missingFeaturesFromRight = "missingFeaturesFromRight" + val recordsWithUnmatchedKeys = "recordsWithUnmatchedKeys" + val featureValuesMatched = "featureValuesMatched" + val featureValuesThatDidNotMatch = "featureValuesThatDidNotMatch" + val equalRecords = "equalRecords" + val keyCount = "keyCount" + + def compareDataRecords( + leftOpt: Option[DataRecord], + rightOpt: Option[DataRecord] + ): collection.Map[String, Long] = { + val stats = collection.Map((keyCount, 1L)) + (leftOpt, rightOpt) match { + case (Some(left), Some(right)) => + if (isIdenticalNonContinuousFeatureSet(left, right)) { + getContinuousFeaturesStats(left, right).foldLeft(stats)(mapMonoid.add) + } else { + mapMonoid.add(stats, (nonContinuousFeaturesDidNotMatch, 1L)) + } + case (Some(_), None) => mapMonoid.add(stats, (missingRecordFromRight, 1L)) + case (None, Some(_)) => mapMonoid.add(stats, (missingRecordFromLeft, 1L)) + case (None, None) => throw new IllegalArgumentException("Should never be possible") + } + } + + /** + * For Continuous features. 
+ */ + private def getContinuousFeaturesStats( + left: DataRecord, + right: DataRecord + ): Seq[(String, Long)] = { + val leftFeatures = Option(left.getContinuousFeatures) + .map(_.asScala.toMap) + .getOrElse(Map.empty[JLong, JDouble]) + + val rightFeatures = Option(right.getContinuousFeatures) + .map(_.asScala.toMap) + .getOrElse(Map.empty[JLong, JDouble]) + + val numMissingFeaturesLeft = (rightFeatures.keySet diff leftFeatures.keySet).size + val numMissingFeaturesRight = (leftFeatures.keySet diff rightFeatures.keySet).size + + if (numMissingFeaturesLeft == 0 && numMissingFeaturesRight == 0) { + val Epsilon = 1e-5 + val numUnmatchedValues = leftFeatures.map { + case (id, lValue) => + val rValue = rightFeatures(id) + // The approximate match is to account for the precision loss due to + // the Double -> Float -> Double conversion. + if (math.abs(lValue - rValue) <= Epsilon) 0L else 1L + }.sum + + if (numUnmatchedValues == 0) { + Seq( + (equalRecords, 1L), + (featureValuesMatched, leftFeatures.size.toLong) + ) + } else { + Seq( + (featureValuesThatDidNotMatch, numUnmatchedValues), + ( + featureValuesMatched, + math.max(leftFeatures.size, rightFeatures.size) - numUnmatchedValues) + ) + } + } else { + Seq( + (recordsWithUnmatchedKeys, 1L), + (missingFeaturesFromLeft, numMissingFeaturesLeft.toLong), + (missingFeaturesFromRight, numMissingFeaturesRight.toLong) + ) + } + } + + /** + * For feature types that are not Feature.Continuous. We expect these to match exactly in the two stores. 
+ * Mutable change + */ + private def isIdenticalNonContinuousFeatureSet(left: DataRecord, right: DataRecord): Boolean = { + val booleanMatched = safeEquals(left.binaryFeatures, right.binaryFeatures) + val discreteMatched = safeEquals(left.discreteFeatures, right.discreteFeatures) + val stringMatched = safeEquals(left.stringFeatures, right.stringFeatures) + val sparseBinaryMatched = safeEquals(left.sparseBinaryFeatures, right.sparseBinaryFeatures) + val sparseContinuousMatched = + safeEquals(left.sparseContinuousFeatures, right.sparseContinuousFeatures) + val blobMatched = safeEquals(left.blobFeatures, right.blobFeatures) + val tensorsMatched = safeEquals(left.tensors, right.tensors) + val sparseTensorsMatched = safeEquals(left.sparseTensors, right.sparseTensors) + + booleanMatched && discreteMatched && stringMatched && sparseBinaryMatched && + sparseContinuousMatched && blobMatched && tensorsMatched && sparseTensorsMatched + } + + def safeEquals[T](l: T, r: T): Boolean = Option(l).equals(Option(r)) +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/scalding/AggregatesV2ScaldingJob.scala b/timelines/data_processing/ml_util/aggregation_framework/scalding/AggregatesV2ScaldingJob.scala new file mode 100644 index 0000000000..aa8ae3612e --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/scalding/AggregatesV2ScaldingJob.scala @@ -0,0 +1,216 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding + +import com.twitter.bijection.thrift.CompactThriftCodec +import com.twitter.bijection.Codec +import com.twitter.bijection.Injection +import com.twitter.ml.api._ +import com.twitter.ml.api.constant.SharedFeatures.TIMESTAMP +import com.twitter.ml.api.util.CompactDataRecordConverter +import com.twitter.ml.api.util.SRichDataRecord +import com.twitter.scalding.Args +import com.twitter.scalding_internal.dalv2.DALWrite.D +import com.twitter.storehaus_internal.manhattan.ManhattanROConfig +import 
com.twitter.summingbird.batch.option.Reducers +import com.twitter.summingbird.batch.BatchID +import com.twitter.summingbird.batch.Batcher +import com.twitter.summingbird.batch.Timestamp +import com.twitter.summingbird.option._ +import com.twitter.summingbird.scalding.Scalding +import com.twitter.summingbird.scalding.batch.{BatchedStore => ScaldingBatchedStore} +import com.twitter.summingbird.Options +import com.twitter.summingbird.Producer +import com.twitter.summingbird_internal.bijection.BatchPairImplicits._ +import com.twitter.summingbird_internal.runner.common.JobName +import com.twitter.summingbird_internal.runner.scalding.GenericRunner +import com.twitter.summingbird_internal.runner.scalding.ScaldingConfig +import com.twitter.summingbird_internal.runner.scalding.StatebirdState +import com.twitter.summingbird_internal.dalv2.DAL +import com.twitter.summingbird_internal.runner.store_config._ +import com.twitter.timelines.data_processing.ml_util.aggregation_framework._ +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding.sources._ +import job.AggregatesV2Job +import org.apache.hadoop.conf.Configuration +/* + * Offline scalding version of summingbird job to compute aggregates v2. + * This is loosely based on the template created by sb-gen. + * Extend this trait in your own scalding job, and override the val + * "aggregatesToCompute" with your own desired set of aggregates. 
+ */ +trait AggregatesV2ScaldingJob { + val aggregatesToCompute: Set[TypedAggregateGroup[_]] + + implicit val aggregationKeyInjection: Injection[AggregationKey, Array[Byte]] = + AggregationKeyInjection + + implicit val aggregationKeyOrdering: AggregationKeyOrdering.type = AggregationKeyOrdering + + implicit val dataRecordCodec: Injection[DataRecord, Array[Byte]] = CompactThriftCodec[DataRecord] + + private implicit val compactDataRecordCodec: Injection[CompactDataRecord, Array[Byte]] = + CompactThriftCodec[CompactDataRecord] + + private val compactDataRecordConverter = new CompactDataRecordConverter() + + def numReducers: Int = -1 + + /** + * Function that maps from a logical ''AggregateSource'' + * to an underlying physical source. The physical source + * for the scalding platform is a ScaldingAggregateSource. + */ + def dataRecordSourceToScalding( + source: AggregateSource + ): Option[Producer[Scalding, DataRecord]] = { + source match { + case offlineSource: OfflineAggregateSource => + Some(ScaldingAggregateSource(offlineSource).source) + case _ => None + } + } + + /** + * Creates and returns a versioned store using the config parameters + * with a specific number of versions to keep, and which can read from + * the most recent available version on HDFS rather than a specific + * version number. The store applies a timestamp correction based on the + * number of days of aggregate data skipped over at read time to ensure + * that skipping data plays nicely with halfLife decay. 
+ * + * @param config specifying the Manhattan store parameters + * @param versionsToKeep number of old versions to keep + */ + def getMostRecentLagCorrectingVersionedStoreWithRetention[ + Key: Codec: Ordering, + ValInStore: Codec, + ValInMemory + ]( + config: OfflineStoreOnlyConfig[ManhattanROConfig], + versionsToKeep: Int, + lagCorrector: (ValInMemory, Long) => ValInMemory, + packer: ValInMemory => ValInStore, + unpacker: ValInStore => ValInMemory + ): ScaldingBatchedStore[Key, ValInMemory] = { + MostRecentLagCorrectingVersionedStore[Key, ValInStore, ValInMemory]( + config.offline.hdfsPath.toString, + packer = packer, + unpacker = unpacker, + versionsToKeep = versionsToKeep)( + Injection.connect[(Key, (BatchID, ValInStore)), (Array[Byte], Array[Byte])], + config.batcher, + implicitly[Ordering[Key]], + lagCorrector + ).withInitialBatch(config.batcher.batchOf(config.startTime.value)) + } + + def mutablyCorrectDataRecordTimestamp( + record: DataRecord, + lagToCorrectMillis: Long + ): DataRecord = { + val richRecord = SRichDataRecord(record) + if (richRecord.hasFeature(TIMESTAMP)) { + val timestamp = richRecord.getFeatureValue(TIMESTAMP).toLong + richRecord.setFeatureValue(TIMESTAMP, timestamp + lagToCorrectMillis) + } + record + } + + /** + * Function that maps from a logical ''AggregateStore'' + * to an underlying physical store. The physical store for + * scalding is a HDFS VersionedKeyValSource dataset. 
+ */ + def aggregateStoreToScalding( + store: AggregateStore + ): Option[Scalding#Store[AggregationKey, DataRecord]] = { + store match { + case offlineStore: OfflineAggregateDataRecordStore => + Some( + getMostRecentLagCorrectingVersionedStoreWithRetention[ + AggregationKey, + DataRecord, + DataRecord]( + offlineStore, + versionsToKeep = offlineStore.batchesToKeep, + lagCorrector = mutablyCorrectDataRecordTimestamp, + packer = Injection.identity[DataRecord], + unpacker = Injection.identity[DataRecord] + ) + ) + case offlineStore: OfflineAggregateDataRecordStoreWithDAL => + Some( + DAL.versionedKeyValStore[AggregationKey, DataRecord]( + dataset = offlineStore.dalDataset, + pathLayout = D.Suffix(offlineStore.offline.hdfsPath.toString), + batcher = offlineStore.batcher, + maybeStartTime = Some(offlineStore.startTime), + maxErrors = offlineStore.maxKvSourceFailures + )) + case _ => None + } + } + + def generate(args: Args): ScaldingConfig = new ScaldingConfig { + val jobName = JobName(args("job_name")) + + /* + * Add registrars for chill serialization for user-defined types. + * We use the default: an empty List(). + */ + override def registrars = List() + + /* Use transformConfig to set Hadoop options. 
*/ + override def transformConfig(config: Map[String, AnyRef]): Map[String, AnyRef] = + super.transformConfig(config) ++ Map( + "mapreduce.output.fileoutputformat.compress" -> "true", + "mapreduce.output.fileoutputformat.compress.codec" -> "com.hadoop.compression.lzo.LzoCodec", + "mapreduce.output.fileoutputformat.compress.type" -> "BLOCK" + ) + + /* + * Use getNamedOptions to set Summingbird runtime options + * The options we set are: + * 1) Set monoid to non-commutative to disable map-side + * aggregation and force all aggregation to reducers (provides a 20% speedup) + */ + override def getNamedOptions: Map[String, Options] = Map( + "DEFAULT" -> Options() + .set(MonoidIsCommutative(false)) + .set(Reducers(numReducers)) + ) + + implicit val batcher: Batcher = Batcher.ofHours(24) + + /* State implementation that uses Statebird (go/statebird) to track the batches processed. */ + def getWaitingState(hadoopConfig: Configuration, startDate: Option[Timestamp], batches: Int) = + StatebirdState( + jobName, + startDate, + batches, + args.optional("statebird_service_destination"), + args.optional("statebird_client_id_name") + )(batcher) + + val sourceNameFilter: Option[Set[String]] = + args.optional("input_sources").map(_.split(",").toSet) + val storeNameFilter: Option[Set[String]] = + args.optional("output_stores").map(_.split(",").toSet) + + val filteredAggregates = + AggregatesV2Job.filterAggregates( + aggregates = aggregatesToCompute, + sourceNames = sourceNameFilter, + storeNames = storeNameFilter + ) + + override val graph = + AggregatesV2Job.generateJobGraph[Scalding]( + filteredAggregates, + dataRecordSourceToScalding, + aggregateStoreToScalding + )(DataRecordAggregationMonoid(filteredAggregates)) + } + def main(args: Array[String]): Unit = { + GenericRunner(args, generate(_)) + + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/scalding/AggregationKeyOrdering.scala 
b/timelines/data_processing/ml_util/aggregation_framework/scalding/AggregationKeyOrdering.scala new file mode 100644 index 0000000000..af6f14ff23 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/scalding/AggregationKeyOrdering.scala @@ -0,0 +1,17 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding + +import com.twitter.scalding_internal.job.RequiredBinaryComparators.ordSer +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey +import com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.MacroEqualityOrderedSerialization + +object AggregationKeyOrdering extends Ordering[AggregationKey] { /* Orders AggregationKey instances by comparing the (Map[Long, Long], Map[Long, String]) tuples returned by AggregationKey.unapply, using the ordSer-derived featureMapsOrdering below. */ + implicit val featureMapsOrdering: MacroEqualityOrderedSerialization[ + (Map[Long, Long], Map[Long, String]) + ] = ordSer[(Map[Long, Long], Map[Long, String])] + + override def compare(left: AggregationKey, right: AggregationKey): Int = + featureMapsOrdering.compare( + AggregationKey.unapply(left).get, + AggregationKey.unapply(right).get + ) +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/scalding/BUILD b/timelines/data_processing/ml_util/aggregation_framework/scalding/BUILD new file mode 100644 index 0000000000..d037666197 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/scalding/BUILD @@ -0,0 +1,72 @@ +scala_library( + sources = ["*.scala"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "3rdparty/jvm/com/twitter/bijection:core", + "3rdparty/jvm/com/twitter/bijection:json", + "3rdparty/jvm/com/twitter/bijection:netty", + "3rdparty/jvm/com/twitter/bijection:scrooge", + "3rdparty/jvm/com/twitter/bijection:thrift", + "3rdparty/jvm/com/twitter/bijection:util", + "3rdparty/jvm/com/twitter/chill:bijection", + "3rdparty/jvm/com/twitter/storehaus:algebra", + "3rdparty/jvm/com/twitter/storehaus:core", + "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", +
"3rdparty/src/jvm/com/twitter/scalding:args", + "3rdparty/src/jvm/com/twitter/scalding:commons", + "3rdparty/src/jvm/com/twitter/scalding:core", + "3rdparty/src/jvm/com/twitter/summingbird:batch", + "3rdparty/src/jvm/com/twitter/summingbird:batch-hadoop", + "3rdparty/src/jvm/com/twitter/summingbird:chill", + "3rdparty/src/jvm/com/twitter/summingbird:core", + "3rdparty/src/jvm/com/twitter/summingbird:scalding", + "finagle/finagle-core/src/main", + "gizmoduck/snapshot/src/main/scala/com/twitter/gizmoduck/snapshot:deleted_user-scala", + "src/java/com/twitter/ml/api:api-base", + "src/java/com/twitter/ml/api/constant", + "src/scala/com/twitter/ml/api/util", + "src/scala/com/twitter/scalding_internal/dalv2", + "src/scala/com/twitter/scalding_internal/job/analytics_batch", + "src/scala/com/twitter/scalding_internal/util", + "src/scala/com/twitter/storehaus_internal/manhattan/config", + "src/scala/com/twitter/storehaus_internal/offline", + "src/scala/com/twitter/storehaus_internal/util", + "src/scala/com/twitter/summingbird_internal/bijection", + "src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits", + "src/scala/com/twitter/summingbird_internal/dalv2", + "src/scala/com/twitter/summingbird_internal/runner/common", + "src/scala/com/twitter/summingbird_internal/runner/scalding", + "src/scala/com/twitter/summingbird_internal/runner/store_config", + "src/scala/com/twitter/summingbird_internal/runner/store_config/versioned_store", + "src/scala/com/twitter/summingbird_internal/sources/common", + "src/thrift/com/twitter/ml/api:data-java", + "src/thrift/com/twitter/ml/api:interpretable-model-java", + "src/thrift/com/twitter/statebird:compiled-v2-java", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + "timelines/data_processing/ml_util/aggregation_framework:user_job", + "timelines/data_processing/ml_util/aggregation_framework/scalding/sources", + "timelines/data_processing/ml_util/sampling:sampling_utils", + ], + exports = [ +
"3rdparty/src/jvm/com/twitter/summingbird:scalding", + "src/scala/com/twitter/storehaus_internal/manhattan/config", + "src/scala/com/twitter/summingbird_internal/runner/store_config", + ], +) + +hadoop_binary( + name = "bin", + basename = "aggregation_framework_scalding-deploy", + main = "com.twitter.scalding.Tool", + platform = "java8", + runtime_platform = "java8", + tags = [ + "bazel-compatible", + "bazel-compatible:migrated", + "bazel-only", + ], + dependencies = [ + ":scalding", + ], +) diff --git a/timelines/data_processing/ml_util/aggregation_framework/scalding/DeletedUserPruner.scala b/timelines/data_processing/ml_util/aggregation_framework/scalding/DeletedUserPruner.scala new file mode 100644 index 0000000000..7e2f7a95ca --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/scalding/DeletedUserPruner.scala @@ -0,0 +1,97 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding + +import com.twitter.gizmoduck.snapshot.DeletedUserScalaDataset +import com.twitter.ml.api.DataRecord +import com.twitter.ml.api.Feature +import com.twitter.scalding.typed.TypedPipe +import com.twitter.scalding.DateOps +import com.twitter.scalding.DateRange +import com.twitter.scalding.Days +import com.twitter.scalding.RichDate +import com.twitter.scalding_internal.dalv2.DAL +import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC +import com.twitter.scalding_internal.job.RequiredBinaryComparators.ordSer +import com.twitter.scalding_internal.pruner.Pruner +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup +import com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.MacroEqualityOrderedSerialization +import java.{util => ju} + +object DeletedUserSeqPruner extends Pruner[Seq[Long]] { /* Pruner keyed on a sequence of user ids: a record is dropped when the id sequence extracted from it contains at least one deleted user. */ + implicit val tz: ju.TimeZone = DateOps.UTC + implicit val
userIdSequenceOrdering: MacroEqualityOrderedSerialization[Seq[Long]] = + ordSer[Seq[Long]] + + private[scalding] def pruneDeletedUsers[T]( + input: TypedPipe[T], + extractor: T => Seq[Long], + deletedUsers: TypedPipe[Long] + ): TypedPipe[T] = { + val userIdsAndValues = input.map { t: T => + val userIds: Seq[Long] = extractor(t) + (userIds, t) + } + + // Find all valid sequences of userids in the input pipe + // that contain at least one deleted user. This is efficient + // as long as the number of deleted users is small. + val userSequencesWithDeletedUsers = userIdsAndValues + .flatMap { case (userIds, _) => userIds.map((_, userIds)) } + .leftJoin(deletedUsers.asKeys) + .collect { case (_, (userIds, Some(_))) => userIds } + .distinct + + userIdsAndValues + .leftJoin(userSequencesWithDeletedUsers.asKeys) + .collect { case (_, (t, None)) => t } /* keeps only the records whose user-id sequence was not flagged above */ + } + + override def prune[T]( + input: TypedPipe[T], + put: (T, Seq[Long]) => Option[T], /* not referenced by this implementation; only get is used */ + get: T => Seq[Long], + writeTime: RichDate + ): TypedPipe[T] = { + lazy val deletedUsers = DAL + .readMostRecentSnapshot(DeletedUserScalaDataset, DateRange(writeTime - Days(7), writeTime)) /* most recent deleted-user snapshot within the 7 days ending at writeTime */ + .withRemoteReadPolicy(AllowCrossClusterSameDC) + .toTypedPipe + .map(_.userId) + + pruneDeletedUsers(input, get, deletedUsers) + } +} + +object AggregationKeyPruner { + + /** + * Makes a pruner that prunes aggregate records where any of the + * "userIdFeatures" set in the aggregation key correspond to a + * user who has deleted their account. Here, "userIdFeatures" is + * intended as a catch-all term for all features corresponding to + * a Twitter user in the input data record -- the feature itself + * could represent an authorId, retweeterId, engagerId, etc. Each id is read from the key's discreteFeaturesById, falling back to textFeaturesById parsed with toLong; features absent from the key contribute no id.
+ */ + def mkDeletedUsersPruner( + userIdFeatures: Seq[Feature[_]] + ): Pruner[(AggregationKey, DataRecord)] = { + val userIdFeatureIds = userIdFeatures.map(TypedAggregateGroup.getDenseFeatureId) + + def getter(tupled: (AggregationKey, DataRecord)): Seq[Long] = { + tupled match { + case (aggregationKey, _) => + userIdFeatureIds.flatMap { id => + aggregationKey.discreteFeaturesById + .get(id) + .orElse(aggregationKey.textFeaturesById.get(id).map(_.toLong)) /* NOTE(review): toLong throws NumberFormatException on non-numeric text -- presumably user-id text features are always numeric; confirm */ + } + } + } + + // Setting putter to always return None here. The put function is not used within pruneDeletedUsers, this function is just needed for xmap api. + def putter: ((AggregationKey, DataRecord), Seq[Long]) => Option[(AggregationKey, DataRecord)] = + (t, seq) => None + + DeletedUserSeqPruner.xmap(putter, getter) + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/scalding/MostRecentVersionedStore.scala b/timelines/data_processing/ml_util/aggregation_framework/scalding/MostRecentVersionedStore.scala new file mode 100644 index 0000000000..d60e677167 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/scalding/MostRecentVersionedStore.scala @@ -0,0 +1,100 @@ +package com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding + +import com.twitter.bijection.Injection +import com.twitter.scalding.commons.source.VersionedKeyValSource +import com.twitter.scalding.TypedPipe +import com.twitter.scalding.{Hdfs => HdfsMode} +import com.twitter.summingbird.batch.store.HDFSMetadata +import com.twitter.summingbird.batch.BatchID +import com.twitter.summingbird.batch.Batcher +import com.twitter.summingbird.batch.OrderedFromOrderingExt +import com.twitter.summingbird.batch.PrunedSpace +import com.twitter.summingbird.scalding._ +import com.twitter.summingbird.scalding.store.VersionedBatchStore +import org.slf4j.LoggerFactory + +object MostRecentLagCorrectingVersionedStore { + def apply[Key, ValInStore, ValInMemory]( + rootPath: String, + packer: ValInMemory
=> ValInStore, + unpacker: ValInStore => ValInMemory, + versionsToKeep: Int = VersionedKeyValSource.defaultVersionsToKeep, + prunedSpace: PrunedSpace[(Key, ValInMemory)] = PrunedSpace.neverPruned + )( + implicit injection: Injection[(Key, (BatchID, ValInStore)), (Array[Byte], Array[Byte])], + batcher: Batcher, + ord: Ordering[Key], + lagCorrector: (ValInMemory, Long) => ValInMemory + ): MostRecentLagCorrectingVersionedBatchStore[Key, ValInMemory, Key, (BatchID, ValInStore)] = { + new MostRecentLagCorrectingVersionedBatchStore[Key, ValInMemory, Key, (BatchID, ValInStore)]( + rootPath, + versionsToKeep, + batcher + )(lagCorrector)({ case (batchID, (k, v)) => (k, (batchID.next, packer(v))) })({ /* pack: the stored value is the packed value paired with batchID.next */ + case (k, (_, v)) => (k, unpacker(v)) + }) { + override def select(b: List[BatchID]) = List(b.last) /* keeps only the last BatchID of the list; NOTE(review): b.last throws on an empty list -- presumably never passed one, confirm */ + override def pruning: PrunedSpace[(Key, ValInMemory)] = prunedSpace + } + } +} + +/** + * @param lagCorrector lagCorrector allows one to take data from one batch and pretend as if it + * came from a different batch. Its Long argument is the lag to correct, which lastBatch passes as lagToCorrectMillis. + * @param pack Converts the in-memory tuples to the type used by the underlying key-val store. + * @param unpack Converts the key-val tuples from the store in the form used by the calling object.
+ */ +class MostRecentLagCorrectingVersionedBatchStore[KeyInMemory, ValInMemory, KeyInStore, ValInStore]( + rootPath: String, + versionsToKeep: Int, + override val batcher: Batcher +)( + lagCorrector: (ValInMemory, Long) => ValInMemory +)( + pack: (BatchID, (KeyInMemory, ValInMemory)) => (KeyInStore, ValInStore) +)( + unpack: ((KeyInStore, ValInStore)) => (KeyInMemory, ValInMemory) +)( + implicit @transient injection: Injection[(KeyInStore, ValInStore), (Array[Byte], Array[Byte])], + override val ordering: Ordering[KeyInMemory]) + extends VersionedBatchStore[KeyInMemory, ValInMemory, KeyInStore, ValInStore]( + rootPath, + versionsToKeep, + batcher)(pack)(unpack)(injection, ordering) { + + import OrderedFromOrderingExt._ /* NOTE(review): presumably supplies the < and > comparisons on BatchID used in lastBatch -- confirm */ + + @transient private val logger = /* NOTE(review): a @transient non-lazy val is not re-initialized after deserialization; logger is only used inside lastBatch -- confirm lastBatch never runs on a deserialized copy */ + LoggerFactory.getLogger(classOf[MostRecentLagCorrectingVersionedBatchStore[_, _, _, _]]) + + override protected def lastBatch( /* returns the newest stored version before exclusiveUB, lag-corrected and re-labelled as batch exclusiveUB.prev; None when no such version exists */ + exclusiveUB: BatchID, + mode: HdfsMode + ): Option[(BatchID, FlowProducer[TypedPipe[(KeyInMemory, ValInMemory)]])] = { + val batchToPretendAs = exclusiveUB.prev /* the batch ID that the returned data is reported under */ + val versionToPretendAs = batchIDToVersion(batchToPretendAs) + logger.info( + s"Most recent lag correcting versioned batched store at $rootPath entering lastBatch method versionToPretendAs = $versionToPretendAs") + val meta = new HDFSMetadata(mode.conf, rootPath) + meta.versions + .map { ver => (versionToBatchID(ver), readVersion(ver)) } + .filter { _._1 < exclusiveUB } /* only versions whose batch precedes exclusiveUB are candidates */ + .reduceOption { (a, b) => if (a._1 > b._1) a else b } /* keep the candidate with the greatest BatchID; None when there are no candidates */ + .map { + case ( + lastBatchID: BatchID, + flowProducer: FlowProducer[TypedPipe[(KeyInMemory, ValInMemory)]]) => + val lastVersion = batchIDToVersion(lastBatchID) + val lagToCorrectMillis: Long = + batchIDToVersion(batchToPretendAs) - batchIDToVersion(lastBatchID) /* difference of the two version numbers; the name suggests versions are millisecond timestamps -- TODO confirm against batchIDToVersion */ + logger.info( + s"Most recent available version is $lastVersion, so lagToCorrectMillis is $lagToCorrectMillis") + val lagCorrectedFlowProducer = flowProducer.map { + pipe: TypedPipe[(KeyInMemory, ValInMemory)] => + pipe.map { case (k, v) => (k,
lagCorrector(v, lagToCorrectMillis)) } /* every value is lag-corrected; keys are passed through unchanged */ + } + (batchToPretendAs, lagCorrectedFlowProducer) + } + } +} diff --git a/timelines/data_processing/ml_util/aggregation_framework/scalding/sources/BUILD b/timelines/data_processing/ml_util/aggregation_framework/scalding/sources/BUILD new file mode 100644 index 0000000000..ba065ecd76 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/scalding/sources/BUILD @@ -0,0 +1,26 @@ +scala_library( + sources = ["*.scala"], + platform = "java8", + tags = ["bazel-compatible"], + dependencies = [ + "3rdparty/jvm/com/twitter/storehaus:algebra", + "3rdparty/src/jvm/com/twitter/scalding:commons", + "3rdparty/src/jvm/com/twitter/scalding:core", + "3rdparty/src/jvm/com/twitter/scalding:date", + "3rdparty/src/jvm/com/twitter/summingbird:batch", + "3rdparty/src/jvm/com/twitter/summingbird:batch-hadoop", + "3rdparty/src/jvm/com/twitter/summingbird:chill", + "3rdparty/src/jvm/com/twitter/summingbird:core", + "3rdparty/src/jvm/com/twitter/summingbird:scalding", + "src/java/com/twitter/ml/api:api-base", + "src/scala/com/twitter/ml/api:api-base", + "src/scala/com/twitter/ml/api/internal", + "src/scala/com/twitter/ml/api/util", + "src/scala/com/twitter/scalding_internal/dalv2", + "src/scala/com/twitter/scalding_internal/dalv2/remote_access", + "src/scala/com/twitter/summingbird_internal/sources/common", + "src/thrift/com/twitter/ml/api:data-java", + "src/thrift/com/twitter/ml/api:interpretable-model-java", + "timelines/data_processing/ml_util/aggregation_framework:common_types", + ], +) diff --git a/timelines/data_processing/ml_util/aggregation_framework/scalding/sources/ScaldingAggregateSource.scala b/timelines/data_processing/ml_util/aggregation_framework/scalding/sources/ScaldingAggregateSource.scala new file mode 100644 index 0000000000..d1820b4fc1 --- /dev/null +++ b/timelines/data_processing/ml_util/aggregation_framework/scalding/sources/ScaldingAggregateSource.scala @@ -0,0 +1,77 @@ +package
com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding.sources + +import com.twitter.ml.api.DailySuffixFeatureSource +import com.twitter.ml.api.DataRecord +import com.twitter.ml.api.FixedPathFeatureSource +import com.twitter.ml.api.HourlySuffixFeatureSource +import com.twitter.ml.api.util.SRichDataRecord +import com.twitter.scalding._ +import com.twitter.scalding_internal.dalv2.DAL +import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC +import com.twitter.statebird.v2.thriftscala.Environment +import com.twitter.summingbird._ +import com.twitter.summingbird.scalding.Scalding.pipeFactoryExact +import com.twitter.summingbird.scalding._ +import com.twitter.summingbird_internal.sources.SourceFactory +import com.twitter.timelines.data_processing.ml_util.aggregation_framework.OfflineAggregateSource +import java.lang.{Long => JLong} + +/* + * Summingbird offline HDFS source that reads from data records on HDFS. + * + * @param offlineSource Underlying offline source that contains + * all the config info to build this platform-specific (scalding) source. 
+ */
+case class ScaldingAggregateSource(offlineSource: OfflineAggregateSource)
+  extends SourceFactory[Scalding, DataRecord] {
+
+  // Falls back to the empty string when the offline source configures no HDFS path.
+  val hdfsPath: String = offlineSource.scaldingHdfsPath.getOrElse("")
+  // Falls back to "daily" when the offline source configures no suffix type.
+  val suffixType: String = offlineSource.scaldingSuffixType.getOrElse("daily")
+  val withValidation: Boolean = offlineSource.withValidation
+  def name: String = offlineSource.name
+  def description: String =
+    "Summingbird offline source that reads from data records at: " + hdfsPath
+
+  // A record's time is the value of the offline source's configured timestamp feature.
+  implicit val timeExtractor: TimeExtractor[DataRecord] = TimeExtractor((record: DataRecord) =>
+    SRichDataRecord(record).getFeatureValue[JLong, JLong](offlineSource.timestampFeature))
+
+  /**
+   * Builds the underlying source for the given date range, chosen by `suffixType`:
+   * "daily", "hourly" and "fixed_path" read from `hdfsPath` ("fixed_path" ignores the
+   * date range); "dal" reads `offlineSource.dalDataSet` through DAL.
+   *
+   * @param dateRange the date range to read.
+   * @throws IllegalArgumentException if `suffixType` is "dal" but no DAL dataset is
+   *                                  configured, or if `suffixType` is not one of the
+   *                                  supported values.
+   */
+  def getSourceForDateRange(dateRange: DateRange) = {
+    suffixType match {
+      case "daily" => DailySuffixFeatureSource(hdfsPath)(dateRange).source
+      case "hourly" => HourlySuffixFeatureSource(hdfsPath)(dateRange).source
+      case "fixed_path" => FixedPathFeatureSource(hdfsPath).source
+      case "dal" =>
+        offlineSource.dalDataSet match {
+          case Some(dataset) =>
+            DAL
+              .read(dataset, dateRange)
+              .withRemoteReadPolicy(AllowCrossClusterSameDC)
+              .withEnvironment(Environment.Prod)
+              .toTypedSource
+          case None =>
+            throw new IllegalArgumentException(
+              "cannot provide an empty dataset when defining DAL as the suffix type"
+            )
+        }
+      // Without this case an unrecognised suffix type fails with an opaque scala.MatchError.
+      case unsupported =>
+        throw new IllegalArgumentException(
+          s"unsupported suffix type '$unsupported'; expected one of: daily, hourly, fixed_path, dal"
+        )
+    }
+  }
+
+  /**
+   * This method is similar to [[Scalding.sourceFromMappable]] except that this uses [[pipeFactoryExact]]
+   * instead of [[pipeFactory]]. [[pipeFactoryExact]] also invokes [[FileSource.validateTaps]] on the source.
+   * The validation ensures the presence of _SUCCESS file before processing. For more details, please refer to
+   * https://jira.twitter.biz/browse/TQ-10618
+   *
+   * @param factory builds the mappable source for a date range.
+   * @return a Summingbird producer backed by [[pipeFactoryExact]].
+   */
+  def sourceFromMappableWithValidation[T: TimeExtractor: Manifest](
+    factory: (DateRange) => Mappable[T]
+  ): Producer[Scalding, T] = {
+    Producer.source[Scalding, T](pipeFactoryExact(factory))
+  }
+
+  /**
+   * The Summingbird producer for this source: the validating variant when
+   * `withValidation` is set, otherwise [[Scalding.sourceFromMappable]].
+   */
+  def source: Producer[Scalding, DataRecord] = {
+    if (withValidation)
+      sourceFromMappableWithValidation(getSourceForDateRange)
+    else
+      Scalding.sourceFromMappable(getSourceForDateRange)
+  }
+}