mirror of
https://github.com/twitter/the-algorithm.git
synced 2025-01-10 15:05:36 +00:00
160 lines
5.1 KiB
Rust
160 lines
5.1 KiB
Rust
|
use std::fs;
|
||
|
use log::{debug};
|
||
|
|
||
|
use serde_json::{Value, Map};
|
||
|
|
||
|
use crate::error::SegDenseError;
|
||
|
use crate::mapper::{FeatureMapper, FeatureInfo, MapWriter};
|
||
|
use crate::segdense_transform_spec_home_recap_2022::{self as seg_dense, InputFeature};
|
||
|
|
||
|
pub fn load_config(file_name: &str) -> seg_dense::Root {
|
||
|
let json_str = fs::read_to_string(file_name).expect(
|
||
|
&format!("Unable to load segdense file {}", file_name));
|
||
|
let seg_dense_config = parse(&json_str).expect(
|
||
|
&format!("Unable to parse segdense file {}", file_name));
|
||
|
return seg_dense_config;
|
||
|
}
|
||
|
|
||
|
pub fn parse(json_str: &str) -> Result<seg_dense::Root, SegDenseError> {
|
||
|
let root: seg_dense::Root = serde_json::from_str(json_str)?;
|
||
|
return Ok(root);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Given a json string containing a seg dense schema create a feature mapper
|
||
|
* which is essentially:
|
||
|
*
|
||
|
* {feature-id -> (Tensor Index, Index of feature within the tensor)}
|
||
|
*
|
||
|
* Feature id : 64 bit hash of the feature name used in DataRecords.
|
||
|
*
|
||
|
* Tensor Index : A vector of tensors is passed to the model. Tensor
|
||
|
* index refers to the tensor this feature is part of.
|
||
|
*
|
||
|
* Index of feature in tensor : The tensors are vectors, the index of
|
||
|
* feature is the position to put the feature value.
|
||
|
*
|
||
|
* There are many assumptions made in this function that is very model specific.
|
||
|
* These assumptions are called out below and need to be schematized eventually.
|
||
|
*
|
||
|
* Call this once for each segdense schema and cache the FeatureMapper.
|
||
|
*/
|
||
|
pub fn safe_load_config(json_str: &str) -> Result<FeatureMapper, SegDenseError> {
|
||
|
let root = parse(json_str)?;
|
||
|
load_from_parsed_config(root)
|
||
|
}
|
||
|
|
||
|
pub fn load_from_parsed_config_ref(root: &seg_dense::Root) -> FeatureMapper {
|
||
|
load_from_parsed_config(root.clone()).unwrap_or_else(
|
||
|
|error| panic!("Error loading all_config.json - {}", error))
|
||
|
}
|
||
|
|
||
|
// Perf note : make 'root' un-owned
|
||
|
pub fn load_from_parsed_config(root: seg_dense::Root) ->
|
||
|
Result<FeatureMapper, SegDenseError> {
|
||
|
|
||
|
let v = root.input_features_map;
|
||
|
|
||
|
// Do error check
|
||
|
let map: Map<String, Value> = match v {
|
||
|
Value::Object(map) => map,
|
||
|
_ => return Err(SegDenseError::JsonMissingObject),
|
||
|
};
|
||
|
|
||
|
let mut fm: FeatureMapper = FeatureMapper::new();
|
||
|
|
||
|
let items = map.values();
|
||
|
|
||
|
// Perf : Consider a way to avoid clone here
|
||
|
for item in items.cloned() {
|
||
|
let mut vec = match item {
|
||
|
Value::Array(v) => v,
|
||
|
_ => return Err(SegDenseError::JsonMissingArray),
|
||
|
};
|
||
|
|
||
|
if vec.len() != 1 {
|
||
|
return Err(SegDenseError::JsonArraySize);
|
||
|
}
|
||
|
|
||
|
let val = vec.pop().unwrap();
|
||
|
|
||
|
let input_feature: seg_dense::InputFeature = serde_json::from_value(val)?;
|
||
|
let feature_id = input_feature.feature_id;
|
||
|
let feature_info = to_feature_info(&input_feature);
|
||
|
|
||
|
match feature_info {
|
||
|
Some(info) => {
|
||
|
debug!("{:?}", info);
|
||
|
fm.set(feature_id, info)
|
||
|
},
|
||
|
None => (),
|
||
|
}
|
||
|
}
|
||
|
|
||
|
Ok(fm)
|
||
|
}
|
||
|
#[allow(dead_code)]
|
||
|
fn add_feature_info_to_mapper(feature_mapper: &mut FeatureMapper, input_features: &Vec<InputFeature>) {
|
||
|
for input_feature in input_features.iter() {
|
||
|
let feature_id = input_feature.feature_id;
|
||
|
let feature_info = to_feature_info(input_feature);
|
||
|
|
||
|
match feature_info {
|
||
|
Some(info) => {
|
||
|
debug!("{:?}", info);
|
||
|
feature_mapper.set(feature_id, info)
|
||
|
},
|
||
|
None => (),
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
pub fn to_feature_info(input_feature: &seg_dense::InputFeature) -> Option<FeatureInfo> {
|
||
|
if input_feature.maybe_exclude {
|
||
|
return None;
|
||
|
}
|
||
|
|
||
|
// This part needs to be schema driven
|
||
|
//
|
||
|
// tensor index : Which of these tensors this feature is part of
|
||
|
// [Continious, Binary, Discrete, User_embedding, user_eng_embedding, author_embedding]
|
||
|
// Note that this order is fixed/hardcoded here, and need to be schematized
|
||
|
//
|
||
|
let tensor_idx: i8 = match input_feature.feature_id {
|
||
|
// user.timelines.twhin_user_follow_embeddings.twhin_user_follow_embeddings
|
||
|
// Feature name is mapped to a feature-id value. The hardcoded values below correspond to a specific feature name.
|
||
|
-2550691008059411095 => 3,
|
||
|
|
||
|
// user.timelines.twhin_user_engagement_embeddings.twhin_user_engagement_embeddings
|
||
|
5390650078733277231 => 4,
|
||
|
|
||
|
// original_author.timelines.twhin_author_follow_embeddings.twhin_author_follow_embeddings
|
||
|
3223956748566688423 => 5,
|
||
|
|
||
|
_ => match input_feature.feature_type {
|
||
|
// feature_type : src/thrift/com/twitter/ml/api/data.thrift
|
||
|
// BINARY = 1, CONTINUOUS = 2, DISCRETE = 3,
|
||
|
// Map to slots in [Continious, Binary, Discrete, ..]
|
||
|
1 => 1,
|
||
|
2 => 0,
|
||
|
3 => 2,
|
||
|
_ => -1,
|
||
|
}
|
||
|
};
|
||
|
|
||
|
if input_feature.index < 0 {
|
||
|
return None;
|
||
|
}
|
||
|
|
||
|
// Handle this case later
|
||
|
if tensor_idx == -1 {
|
||
|
return None;
|
||
|
}
|
||
|
|
||
|
Some(FeatureInfo {
|
||
|
tensor_index: tensor_idx,
|
||
|
index_within_tensor: input_feature.index,
|
||
|
})
|
||
|
}
|
||
|
|