mirror of
https://github.com/twitter/the-algorithm.git
synced 2025-01-09 22:05:51 +00:00
Open-sourcing Tweetypie
Tweetypie is the core Tweet service that handles the reading and writing of Tweet data.
This commit is contained in:
parent
90d7ea370e
commit
01dbfee4c0
README.md
tweetypie/common/src/scala/com/twitter/tweetypie
additionalfields
caching
BUILDCacheOperations.scalaCacheResult.scalaExpiry.scalaServoCachedValueSerializer.scalaSoftTtl.scalaStitchAsync.scalaStitchCacheOperations.scalaStitchCaching.scalaValueSerializer.scala
client_id
context
decider
jiminy/tweetypie
matching
media
storage
AddTweetHandler.scalaBUILDBounceDeleteHandler.scalaCodecs.scalaDeleteAdditionalFieldsHandler.scalaField.scalaGetDeletedTweetsHandler.scalaGetStoredTweetHandler.scalaGetTweetHandler.scalaHardDeleteTweetHandler.scalaInspectFields.scalaJson.scalaManhattanOperations.scalaManhattanTweetStorageClient.scalaResponse.scalaScribe.scalaScrubHandler.scalaSoftDeleteHandler.scalaStats.scalaStatusConversions.scalaStorageConversions.scalaTimestampDecoder.scalaTweetKey.scalaTweetStateRecord.scalaTweetStorageClient.scalaTweetStorageException.scalaTweetUtils.scalaUndeleteHandler.scalaUpdateTweetHandler.scalapackage.scala
tflock
thriftscala
tweettext
BUILDGraphemeIndexIterator.scalaIndexConverter.scalaOffset.scalaPartialHtmlEncoding.scalaPreprocessor.scalaTextEntity.scalaTextModification.scalaTruncator.scalaTweetText.scala
util
|
@ -24,6 +24,7 @@ Product surfaces at Twitter are built on a shared set of data, models, and softw
|
|||
| | [timelines-aggregation-framework](timelines/data_processing/ml_util/aggregation_framework/README.md) | Framework for generating aggregate features in batch or real time. |
|
||||
| | [representation-manager](representation-manager/README.md) | Service to retrieve embeddings (i.e. SimClusters and TwHIN). |
|
||||
| | [twml](twml/README.md) | Legacy machine learning framework built on TensorFlow v1. |
|
||||
| | [Tweetypie](tweetypie/server/README.md) | Core Tweet service that handles the reading and writing of Tweet data. |
|
||||
|
||||
The product surface currently included in this repository is the For You Timeline.
|
||||
|
||||
|
|
|
@ -0,0 +1,118 @@
|
|||
package com.twitter.tweetypie.additionalfields
|
||||
|
||||
import com.twitter.tweetypie.thriftscala.Tweet
|
||||
import com.twitter.scrooge.TFieldBlob
|
||||
import com.twitter.scrooge.ThriftStructField
|
||||
|
||||
/**
 * Helpers for working with the "additional field" portion of a [[Tweet]]:
 * the compiled fields whose IDs are at or above [[AdditionalFields.StartAdditionalId]]
 * plus any passthrough fields carried on the struct.
 */
object AdditionalFields {

  /** Identifier of a field on the [[Tweet]] thrift struct. */
  type FieldId = Short

  /**
   * Lowest field ID that is treated as an additional field. Additional fields
   * really start at 100, but we are ignoring conversation id for now.
   */
  val StartAdditionalId: Int = 101

  /** IDs of every field declared in the [[Tweet]] metadata. */
  val CompiledFieldIds: Seq[FieldId] =
    Tweet.metaData.fields.map(field => field.id)

  /** Metadata of every declared [[Tweet]] field whose ID is in the additional-field range. */
  val CompiledAdditionalFieldMetaDatas: Seq[ThriftStructField[Tweet]] =
    Tweet.metaData.fields.filter(field => isAdditionalFieldId(field.id))

  /** The same metadata as [[CompiledAdditionalFieldMetaDatas]], indexed by field ID. */
  val CompiledAdditionalFieldsMap: Map[Short, ThriftStructField[Tweet]] =
    CompiledAdditionalFieldMetaDatas.map(field => field.id -> field).toMap

  /** IDs of every declared [[Tweet]] field in the additional-field range. */
  val CompiledAdditionalFieldIds: Seq[FieldId] =
    CompiledAdditionalFieldsMap.keys.toSeq

  /**
   * [[Tweet]] field IDs that must be rejected when they are supplied as additional
   * fields via PostTweetRequest.additionalFields or RetweetRequest.additionalFields.
   */
  val RejectedFieldIds: Seq[FieldId] = Seq(
    // Conversation control is supplied through PostTweetRequest.conversationControl
    // instead. go/convocontrolsbackend
    Tweet.ConversationControlField.id,
    // Communities is only set based on whether the client sets the right community
    // tweet annotation.
    Tweet.CommunitiesField.id,
    // Clients should use [[PostTweetRequest.ExclusiveTweetControlOptions]] instead.
    // exclusiveTweetControl requires the userId to be set, and we shouldn't trust
    // the client to provide the right one.
    Tweet.ExclusiveTweetControlField.id,
    // Clients should use [[PostTweetRequest.TrustedFriendsControlOptions]] instead.
    // trustedFriendsControl requires the trustedFriendsListId to be set, and we
    // shouldn't trust the client to provide the right one.
    Tweet.TrustedFriendsControlField.id,
    // Clients should use [[PostTweetRequest.CollabControlOptions]] instead.
    // collabControl requires a list of Collaborators to be set, and we shouldn't
    // trust the client to provide the right one.
    Tweet.CollabControlField.id
  )

  /** True when `fieldId` is at or above [[StartAdditionalId]]. */
  def isAdditionalFieldId(fieldId: FieldId): Boolean =
    StartAdditionalId <= fieldId

  /**
   * Every additional field ID relevant to `tweet`: all compiled additional field
   * IDs (whether or not the tweet has a value for them) followed by the IDs of the
   * tweet's passthrough fields.
   */
  def allAdditionalFieldIds(tweet: Tweet): Seq[FieldId] = {
    val passthroughIds = tweet._passthroughFields.keys
    CompiledAdditionalFieldIds ++ passthroughIds
  }

  /**
   * IDs of fields that carry a value on `tweet` but are not compiled additional
   * fields: compiled fields below the additional range (other than [[Tweet.id]])
   * for which the tweet has a field blob, followed by all passthrough field IDs.
   */
  def unsettableAdditionalFieldIds(tweet: Tweet): Seq[FieldId] = {
    val populatedCoreIds = CompiledFieldIds.filterNot { id =>
      isAdditionalFieldId(id) || id == Tweet.IdField.id || tweet.getFieldBlob(id).isEmpty
    }
    populatedCoreIds ++ tweet._passthroughFields.keys
  }

  /**
   * The subset of [[RejectedFieldIds]] for which `tweet` has a field blob, i.e. the
   * fields that are explicitly disallowed from being set via
   * PostTweetRequest.additionalFields and RetweetRequest.additionalFields.
   */
  def rejectedAdditionalFieldIds(tweet: Tweet): Seq[FieldId] =
    RejectedFieldIds.filter(id => tweet.getFieldBlob(id).nonEmpty)

  /** Formats the error message listing the offending field IDs in ascending order. */
  def unsettableAdditionalFieldIdsErrorMessage(unsettableFieldIds: Seq[FieldId]): String = {
    val listing = unsettableFieldIds.sorted.mkString(", ")
    s"request may not contain fields: [$listing]"
  }

  /**
   * IDs of the additional fields that have a value on `tweet`: compiled additional
   * fields whose value is not `None`, followed by all passthrough field IDs.
   */
  def nonEmptyAdditionalFieldIds(tweet: Tweet): Seq[FieldId] = {
    val populatedCompiledIds =
      CompiledAdditionalFieldMetaDatas
        .filter(f => f.getValue(tweet) != None)
        .map(f => f.id)
    populatedCompiledIds ++ tweet._passthroughFields.keys
  }

  /**
   * The field blobs of the compiled additional fields of `tweet` combined with its
   * passthrough fields.
   */
  def additionalFields(tweet: Tweet): Seq[TFieldBlob] = {
    val compiledBlobs = tweet.getFieldBlobs(CompiledAdditionalFieldIds)
    val allBlobs = compiledBlobs ++ tweet._passthroughFields
    allBlobs.values.toSeq
  }

  /**
   * Merge base tweet with additional fields.
   * Non-additional fields in the additional tweet are ignored.
   *
   * @param base a tweet that contains basic fields
   * @param additional a tweet object that carries additional fields
   */
  def setAdditionalFields(base: Tweet, additional: Tweet): Tweet =
    setAdditionalFields(base, additionalFields(additional))

  /** Like the [[Tweet]] overload, but returns `base` unchanged when `additional` is empty. */
  def setAdditionalFields(base: Tweet, additional: Option[Tweet]): Tweet =
    additional match {
      case Some(extra) => setAdditionalFields(base, extra)
      case None => base
    }

  /** Sets each of the given field blobs on `base`, in traversal order. */
  def setAdditionalFields(base: Tweet, additional: Traversable[TFieldBlob]): Tweet =
    additional.foldLeft(base)((tweet, blob) => tweet.setField(blob))

  /** Unsets the specified fields on the given tweet. */
  def unsetFields(tweet: Tweet, fieldIds: Iterable[FieldId]): Tweet =
    tweet.unsetFields(fieldIds.toSet)
}
|
|
@ -0,0 +1,15 @@
|
|||
# Scala library target built from every file matching "*.scala" (see `sources`).
# Warnings are fatal (`compiler_option_sets`) and `strict_deps` is enabled, so each
# dependency used by the sources is expected to be listed explicitly below.
# NOTE(review): presumably this is the BUILD file next to AdditionalFields.scala — confirm.
scala_library(
|
||||
sources = ["*.scala"],
|
||||
compiler_option_sets = ["fatal_warnings"],
|
||||
platform = "java8",
|
||||
strict_deps = True,
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"3rdparty/jvm/org/apache/thrift:libthrift",
|
||||
"mediaservices/commons/src/main/thrift:thrift-scala",
|
||||
"scrooge/scrooge-core",
|
||||
"src/thrift/com/twitter/escherbird:media-annotation-structs-scala",
|
||||
"src/thrift/com/twitter/spam/rtf:safety-label-scala",
|
||||
"tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,15 @@
|
|||
# Scala library target with fatal warnings and `strict_deps` enabled.
# No `sources` or `platform` is given here, unlike the target above.
# NOTE(review): presumably the build tool's defaults apply and this is the BUILD
# file of the caching package — confirm.
scala_library(
|
||||
compiler_option_sets = ["fatal_warnings"],
|
||||
strict_deps = True,
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"finagle/finagle-memcached/src/main/scala",
|
||||
"scrooge/scrooge-serializer",
|
||||
"stitch/stitch-core",
|
||||
"util/util-core",
|
||||
"util/util-logging",
|
||||
# CachedValue struct
|
||||
"tweetypie/servo/repo/src/main/thrift:thrift-scala",
|
||||
"util/util-slf4j-api/src/main/scala/com/twitter/util/logging",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,241 @@
|
|||
package com.twitter.tweetypie.caching
|
||||
|
||||
import com.twitter.finagle.service.StatsFilter
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.finagle.stats.ExceptionStatsHandler
|
||||
import com.twitter.finagle.stats.Counter
|
||||
import com.twitter.util.Future
|
||||
import com.twitter.util.logging.Logger
|
||||
import com.twitter.finagle.memcached
|
||||
import scala.util.control.NonFatal
|
||||
|
||||
/**
|
||||
* Wrapper around a memcached client that performs serialization and
|
||||
* deserialization, tracks stats, provides tracing, and provides
|
||||
* per-key fresh/stale/failure/miss results.
|
||||
*
|
||||
* The operations that write values to cache will only write values
|
||||
* that the ValueSerializer says are cacheable. The idea here is that
|
||||
* the deserialize and serialize functions must be coherent, and no
|
||||
* matter how you choose to write these values back to cache, the
|
||||
* serializer will have the appropriate knowledge about whether the
|
||||
* values are cacheable.
|
||||
*
|
||||
* For most cases, you will want to use [[StitchCaching]] rather than
|
||||
* calling this wrapper directly.
|
||||
*
|
||||
* @param keySerializer How to convert a K value to a memcached key.
|
||||
*
|
||||
* @param valueSerializer How to serialize and deserialize V values,
|
||||
* as well as which values are cacheable, and how long to store the
|
||||
* values in cache.
|
||||
*/
|
||||
class CacheOperations[K, V](
  keySerializer: K => String,
  valueSerializer: ValueSerializer[V],
  memcachedClient: memcached.Client,
  statsReceiver: StatsReceiver,
  logger: Logger,
  exceptionStatsHandler: ExceptionStatsHandler = StatsFilter.DefaultExceptions) {
  // The memcached operations that are performed via this
  // [[CacheOperations]] instance will be tracked under this stats
  // receiver.
  //
  // We count all memcached failures together under this scope,
  // because memcached operations should not fail unless there are
  // communication problems, so differentiating the method that was
  // being called will not give us any useful information.
  private[this] val memcachedStats: StatsReceiver = statsReceiver.scope("memcached")

  // Incremented for every attempt to `get` a key from cache.
  private[this] val memcachedGetCounter: Counter = memcachedStats.counter("get")

  // One of these two counters is incremented for every successful
  // response returned from a `get` call to memcached.
  private[this] val memcachedNotFoundCounter: Counter = memcachedStats.counter("not_found")
  private[this] val memcachedFoundCounter: Counter = memcachedStats.counter("found")

  // Records the state of the cache load after deserialization. The
  // policy may transform a value that was successfully loaded from
  // cache into any result type, which is why we explicitly track
  // "found" and "not_found" above. If `stale` + `fresh` is not equal
  // to `found`, then it means that the policy has translated a found
  // value into a miss or failure. The policy may do this in order to
  // cause the caching filter to treat the value that was found in
  // cache in the way it would have treated a miss or failure from
  // cache.
  private[this] val resultStats: StatsReceiver = statsReceiver.scope("result")
  private[this] val resultFreshCounter: Counter = resultStats.counter("fresh")
  private[this] val resultStaleCounter: Counter = resultStats.counter("stale")
  private[this] val resultMissCounter: Counter = resultStats.counter("miss")
  private[this] val resultFailureCounter: Counter = resultStats.counter("failure")

  // Used for recording exceptions that occurred during
  // deserialization. This will never be incremented if the
  // deserializer returns a result, even if the result is a
  // [[CacheResult.Failure]]. See the comment where this stat is
  // incremented for more details.
  private[this] val deserializeFailureStats: StatsReceiver = statsReceiver.scope("deserialize")

  // Incremented whenever `set` is called with a value that the
  // serializer declines to serialize.
  private[this] val notSerializedCounter: Counter = statsReceiver.counter("not_serialized")

  /**
   * Load a batch of values from cache. Mostly this deals with
   * converting the [[memcached.GetResult]] to a
   * `Seq[CacheResult[V]]`. The result is in the same order as the
   * keys, and there will always be an entry for each key. This method
   * should never return a [[Future.exception]].
   *
   * @param keys the keys to load; each is converted to a memcached key
   *   with `keySerializer`.
   * @return one [[CacheResult]] per key, in key order. A failure of the
   *   whole memcached request is reported as a [[CacheResult.Failure]]
   *   for every key.
   */
  def get(keys: Seq[K]): Future[Seq[CacheResult[V]]] = {
    memcachedGetCounter.incr(keys.size)
    val cacheKeys: Seq[String] = keys.map(keySerializer)
    if (logger.isTraceEnabled) {
      logger.trace {
        val lines: Seq[String] = keys.zip(cacheKeys).map { case (k, c) => s"\n $k ($c)" }
        "Starting load for keys:" + lines.mkString
      }
    }

    memcachedClient
      .getResult(cacheKeys)
      .map { getResult =>
        memcachedNotFoundCounter.incr(getResult.misses.size)
        val results: Seq[CacheResult[V]] =
          cacheKeys.map { cacheKey =>
            val result: CacheResult[V] = lookup(getResult, cacheKey)
            countResult(result)
            result
          }

        if (logger.isTraceEnabled) {
          logger.trace {
            val lines: Seq[String] =
              (keys, cacheKeys, results).zipped.map {
                case (key, cacheKey, result) => s"\n $key ($cacheKey) -> $result"
              }

            "Cache results:" + lines.mkString
          }
        }

        results
      }
      .handle {
        case e =>
          // If there is a failure from the memcached client, fan it
          // out to each cache key, so that the caller does not need
          // to handle failure of the batch differently than failure
          // of individual keys. This should be rare anyway, since the
          // memcached client already does this for common Finagle
          // exceptions
          resultFailureCounter.incr(keys.size)
          val theFailure: CacheResult[V] = CacheResult.Failure(e)
          keys.map { _ =>
            // Record this as many times as we would if it were in the GetResult
            exceptionStatsHandler.record(memcachedStats, e)
            theFailure
          }
      }
  }

  /**
   * Translate the memcached response for a single cache key into a
   * [[CacheResult]], recording the found/deserialize/failure stats.
   */
  private[this] def lookup(getResult: memcached.GetResult, cacheKey: String): CacheResult[V] =
    getResult.hits.get(cacheKey) match {
      case Some(memcachedValue) =>
        memcachedFoundCounter.incr()
        try {
          valueSerializer.deserialize(memcachedValue.value)
        } catch {
          case NonFatal(e) =>
            // If the serializer throws an exception, then
            // the serialized value was malformed. In that
            // case, we record the failure so that it can be
            // detected and fixed, but treat it as a cache
            // miss. The reason that we treat it as a miss
            // rather than a failure is that a miss will
            // cause a write back to cache, and we want to
            // write a valid result back to cache to replace
            // the bad entry that we just loaded.
            //
            // A serializer is free to return Miss itself to
            // obtain this behavior if it is expected or
            // desired, to avoid the logging and stats (and
            // the minor overhead of catching an exception).
            //
            // The exceptions are tracked separately from
            // other exceptions so that it is easy to see
            // whether the deserializer itself ever throws an
            // exception.
            exceptionStatsHandler.record(deserializeFailureStats, e)
            logger.warn(s"Failed deserializing value for cache key $cacheKey", e)
            CacheResult.Miss
        }

      case None if getResult.misses.contains(cacheKey) =>
        CacheResult.Miss

      case None =>
        val exception =
          getResult.failures.get(cacheKey) match {
            case None =>
              // To get here, this was not a hit or a miss,
              // so we expect the key to be present in
              // failures. If it is not, then either the
              // contract of getResult was violated, or this
              // method is somehow attempting to access a
              // result for a key that was not
              // loaded. Either of these indicates a bug, so
              // we log a high priority log message.
              logger.error(
                s"Key $cacheKey not found in hits, misses or failures. " +
                  "This indicates a bug in the memcached library or " +
                  "CacheOperations.get"
              )
              // We return this as a failure because that
              // will cause the repo to be consulted and the
              // value *not* to be written back to cache,
              // which is probably the safest thing to do
              // (if we don't know what's going on, default
              // to an uncached repo).
              new IllegalStateException

            case Some(e) =>
              e
          }
        exceptionStatsHandler.record(memcachedStats, exception)
        CacheResult.Failure(exception)
    }

  /**
   * Count each kind of CacheResult, to make it possible to see how
   * effective the caching is.
   */
  private[this] def countResult(result: CacheResult[V]): Unit =
    result match {
      case CacheResult.Fresh(_) => resultFreshCounter.incr()
      case CacheResult.Stale(_) => resultStaleCounter.incr()
      case CacheResult.Miss => resultMissCounter.incr()
      case CacheResult.Failure(_) => resultFailureCounter.incr()
    }

  // Incremented for every attempt to `set` a key in cache.
  private[this] val memcachedSetCounter: Counter = memcachedStats.counter("set")

  /**
   * Write an entry back to cache, using `set`. If the serializer does
   * not serialize the value, then this method will immediately return
   * with success.
   *
   * @param key converted to a memcached key with `keySerializer`.
   * @param value passed to the serializer, which decides whether it is
   *   cacheable and with which expiry.
   * @return the future of the memcached `set` call (failures are
   *   recorded in stats but not recovered), or [[Future.Done]] when the
   *   value was not serialized.
   */
  def set(key: K, value: V): Future[Unit] =
    valueSerializer.serialize(value) match {
      case Some((expiry, serialized)) =>
        if (logger.isTraceEnabled) {
          logger.trace(s"Writing back to cache $key -> $value (expiry = $expiry)")
        }
        memcachedSetCounter.incr()
        memcachedClient
          .set(key = keySerializer(key), flags = 0, expiry = expiry, value = serialized)
          .onFailure(exceptionStatsHandler.record(memcachedStats, _))

      case None =>
        if (logger.isTraceEnabled) {
          logger.trace(s"Not writing back $key -> $value")
        }
        notSerializedCounter.incr()
        Future.Done
    }
}
|
|
@ -0,0 +1,45 @@
|
|||
package com.twitter.tweetypie.caching
|
||||
|
||||
/**
|
||||
* Encodes the possible states of a value loaded from memcached.
|
||||
*
|
||||
* @see [[ValueSerializer]] and [[CacheOperations]]
|
||||
*/
|
||||
sealed trait CacheResult[+V]

/**
 * The four outcomes of a cache load: two that carry a usable value
 * ([[CacheResult.Fresh]], [[CacheResult.Stale]]) and two that do not
 * ([[CacheResult.Miss]], [[CacheResult.Failure]]).
 */
object CacheResult {

  /**
   * The value was found in cache and can be used as-is; the original
   * source does not need to be consulted.
   */
  case class Fresh[V](value: V) extends CacheResult[V]

  /**
   * The value was found in cache and should be used, but it should
   * also be refreshed out-of-band.
   */
  case class Stale[V](value: V) extends CacheResult[V]

  /**
   * The cache load attempt itself succeeded, but produced no usable
   * value.
   *
   * When handling a `Miss`, the value should be written back to cache
   * once it has been loaded successfully.
   */
  case object Miss extends CacheResult[Nothing]

  /**
   * The value could not be loaded from cache. `Failure` values should
   * not be written back to cache.
   *
   * This may come from an error talking to the memcached instance, or
   * it may be returned from the Serializer when the cached value
   * should neither be reused nor overwritten.
   */
  final case class Failure(e: Throwable) extends CacheResult[Nothing]
}
|
|
@ -0,0 +1,34 @@
|
|||
package com.twitter.tweetypie.caching
|
||||
|
||||
import com.twitter.util.Duration
|
||||
import com.twitter.util.Time
|
||||
|
||||
/**
|
||||
* Helpers for creating common expiry functions.
|
||||
*
|
||||
* An expiry function maps from the value to a time in the future when
|
||||
* the value should expire from cache. These are useful in the
|
||||
* implementation of a [[ValueSerializer]].
|
||||
*/
|
||||
object Expiry {

  /**
   * Expiry function that yields [[Time.Top]] for every value, which
   * signals to memcached that the entry should never expire.
   *
   * The argument type is [[Any]] because the value is never inspected,
   * so the function is usable for any value type.
   */
  val Never: Any => Time =
    (_: Any) => Time.Top

  /**
   * Build an expiry function that yields the current time (taken when
   * the function is applied) plus `ttl`, signalling to memcached that
   * the value should not be used after `ttl` has elapsed.
   *
   * The argument type is [[Any]] because the value is never inspected,
   * so the function is usable for any value type.
   */
  def byAge(ttl: Duration): Any => Time =
    (_: Any) => Time.now + ttl
}
|
|
@ -0,0 +1,140 @@
|
|||
package com.twitter.tweetypie.caching
|
||||
|
||||
import com.twitter.io.Buf
|
||||
import com.twitter.scrooge.CompactThriftSerializer
|
||||
import com.twitter.scrooge.ThriftStruct
|
||||
import com.twitter.scrooge.ThriftStructCodec
|
||||
import com.twitter.servo.cache.thriftscala.CachedValue
|
||||
import com.twitter.servo.cache.thriftscala.CachedValueStatus
|
||||
import com.twitter.stitch.NotFound
|
||||
import com.twitter.util.Return
|
||||
import com.twitter.util.Throw
|
||||
import com.twitter.util.Time
|
||||
import com.twitter.util.Try
|
||||
import java.nio.ByteBuffer
|
||||
|
||||
object ServoCachedValueSerializer {

  /**
   * Thrown when the fields of the servo CachedValue struct do not
   * satisfy the invariants expected by this serialization code.
   *
   * @param cachedValue the struct that violated the invariants; it is
   *   included in the exception message.
   */
  case class UnexpectedCachedValueState(cachedValue: CachedValue) extends Exception {
    def message: String = s"Unexpected state for CachedValue. Value was: $cachedValue"

    // Exception's own message is otherwise null (no message is passed to
    // the superclass constructor), so loggers and Throwable.toString would
    // drop the description above. Surface it through the standard accessor.
    override def getMessage: String = message
  }

  /** Serializer for the servo [[CachedValue]] envelope. */
  val CachedValueThriftSerializer: CompactThriftSerializer[CachedValue] =
    CompactThriftSerializer(CachedValue)
}
|
||||
|
||||
/**
|
||||
* A [[ValueSerializer]] that is compatible with the use of
|
||||
* Servo's [[CachedValue]] struct by tweetypie:
|
||||
*
|
||||
* - The only [[CachedValueStatus]] values that are cacheable are
|
||||
* [[CachedValueStatus.Found]] and [[CachedValueStatus.NotFound]].
|
||||
*
|
||||
* - We only track the `cachedAtMsec` field, because tweetypie's cache
|
||||
* interaction does not use the other fields, and the values that
|
||||
* are cached this way are never updated, so storing readThroughAt
|
||||
* or writtenThroughAt would not add any information.
|
||||
*
|
||||
* - When values are present, they are serialized using
|
||||
* [[org.apache.thrift.protocol.TCompactProtocol]].
|
||||
*
|
||||
* - The CachedValue struct itself is also serialized using TCompactProtocol.
|
||||
*
|
||||
* The serializer operates on [[Try]] values and will cache [[Return]]
|
||||
* and `Throw(NotFound)` values.
|
||||
*/
|
||||
case class ServoCachedValueSerializer[V <: ThriftStruct](
  codec: ThriftStructCodec[V],
  expiry: Try[V] => Time,
  softTtl: SoftTtl[Try[V]])
    extends ValueSerializer[Try[V]] {
  import ServoCachedValueSerializer.CachedValueThriftSerializer
  import ServoCachedValueSerializer.UnexpectedCachedValueState

  // Serializer for the payload struct stored inside the CachedValue envelope.
  private[this] val ValueThriftSerializer = CompactThriftSerializer(codec)

  /**
   * Produce the expiry (from the `expiry` function) together with a
   * serialized servo CachedValue struct that has these fields set:
   *
   *  - `value`: [[None]] for `Throw(NotFound)`; `Some(encodedStruct)`
   *    for [[Return]], where `encodedStruct` is the serialized form of
   *    the struct inside of the Return.
   *
   *  - `status`: [[CachedValueStatus.Found]] if the value is Return,
   *    and [[CachedValueStatus.NotFound]] if it is Throw(NotFound)
   *
   *  - `cachedAtMsec`: The current time, according to [[Time.now]]
   *
   * No other fields are set here.
   *
   * @return [[None]] when called with a value that should not be
   *   cached, i.e. anything other than [[Return]] or `Throw(NotFound)`.
   */
  override def serialize(value: Try[V]): Option[(Time, Buf)] = {
    // Wrap the (possibly absent) payload in a CachedValue envelope and
    // pair the encoded envelope with the expiry for `value`.
    def toEntry(payload: Option[ByteBuffer]): (Time, Buf) = {
      val status =
        if (payload.isEmpty) CachedValueStatus.NotFound else CachedValueStatus.Found
      val envelope = CachedValue(
        value = payload,
        status = status,
        cachedAtMsec = Time.now.inMilliseconds)
      val envelopeBytes = CachedValueThriftSerializer.toBytes(envelope)

      (expiry(value), Buf.ByteArray.Owned(envelopeBytes))
    }

    value match {
      case Return(struct) =>
        val structBytes = ValueThriftSerializer.toBytes(struct)
        Some(toEntry(Some(ByteBuffer.wrap(structBytes))))
      case Throw(NotFound) =>
        Some(toEntry(None))
      case _ =>
        None
    }
  }

  /**
   * Deserializes values serialized by [[serialize]]. The value will be
   * [[CacheResult.Fresh]] or [[CacheResult.Stale]] depending on the
   * result of `softTtl.isFresh`.
   *
   * @throws UnexpectedCachedValueState if the state of the
   *   [[CachedValue]] could not be produced by [[serialize]]
   */
  override def deserialize(buf: Buf): CacheResult[Try[V]] = {
    val envelope = CachedValueThriftSerializer.fromBytes(Buf.ByteArray.Owned.extract(buf))

    val value: Try[V] =
      (envelope.value, envelope.status) match {
        case (Some(payload), CachedValueStatus.Found) =>
          // Read through a duplicate so that the position of the
          // buffer held by the envelope is left untouched.
          val payloadBytes = new Array[Byte](payload.remaining)
          payload.duplicate.get(payloadBytes)
          Return(ValueThriftSerializer.fromBytes(payloadBytes))

        case (None, CachedValueStatus.NotFound) =>
          Throw(NotFound)

        case _ =>
          // Exceptions thrown by deserialization are recorded and treated
          // as a cache miss by CacheOperations, so throwing this
          // exception will cause the value in cache to be
          // overwritten. There will be stats recorded whenever this
          // happens.
          throw UnexpectedCachedValueState(envelope)
      }

    softTtl.toCacheResult(value, Time.fromMilliseconds(envelope.cachedAtMsec))
  }
}
|
|
@ -0,0 +1,120 @@
|
|||
package com.twitter.tweetypie.caching
|
||||
|
||||
import com.twitter.util.Duration
|
||||
import com.twitter.util.Time
|
||||
import scala.util.Random
|
||||
import com.twitter.logging.Logger
|
||||
|
||||
/**
|
||||
* Used to determine whether values successfully retrieved from cache
|
||||
* are [[CacheResult.Fresh]] or [[CacheResult.Stale]]. This is useful
|
||||
* in the implementation of a [[ValueSerializer]].
|
||||
*/
|
||||
trait SoftTtl[-V] {

  /**
   * Decide whether a value loaded from cache still counts as fresh.
   *
   * @param value the value that was loaded from cache.
   * @param cachedAt the time at which the value was cached.
   */
  def isFresh(value: V, cachedAt: Time): Boolean

  /**
   * Wrap `value` in [[CacheResult.Fresh]] when `isFresh` returns true
   * for it, and in [[CacheResult.Stale]] otherwise.
   *
   * (A contravariant type may not appear as a return type, which is
   * why the result is expressed through the type variable U, a
   * specific subtype of V. Keeping V contravariant lets polymorphic
   * policies be created without spelling out the type; making it
   * invariant would force every instance to name the type.)
   */
  def toCacheResult[U <: V](value: U, cachedAt: Time): CacheResult[U] = {
    val fresh = isFresh(value, cachedAt)
    if (!fresh) CacheResult.Stale[U](value) else CacheResult.Fresh[U](value)
  }
}
|
||||
|
||||
object SoftTtl {

  /**
   * Policy under which every value is considered fresh, whatever the
   * inputs are.
   */
  object NeverRefresh extends SoftTtl[Any] {
    override def isFresh(_unusedValue: Any, _unusedCachedAt: Time): Boolean = true
  }

  /**
   * Trigger refresh based on the length of time that a value has been
   * stored in cache, ignoring the value.
   *
   * @param softTtl Items that were cached longer ago than this value
   *   will be refreshed when they are accessed.
   *
   * @param jitter Add nondeterminism to the soft TTL to prevent a
   *   thundering herd of requests refreshing the value at the same
   *   time. The age at which the value is considered stale is drawn
   *   uniformly from the range softTtl +/- jitter. It is valid to set
   *   the jitter to zero, which will turn off jittering. Must be
   *   non-negative and at most `Int.MaxValue / 2` milliseconds.
   *
   * @param specificLogger If non-null, use this logger rather than one
   *   based on the class name. This logger is only used for trace-level
   *   logging.
   *
   * @param rng Source of randomness for the jitter.
   */
  case class ByAge[V](
    softTtl: Duration,
    jitter: Duration,
    specificLogger: Logger = null,
    rng: Random = Random)
      extends SoftTtl[Any] {

    private[this] val logger: Logger =
      Option(specificLogger).getOrElse(Logger(getClass))

    private[this] val maxJitterMs: Long = jitter.inMilliseconds

    // The jitter is chosen with Random.nextInt over twice this value,
    // so twice the jitter has to fit in an Int. That still allows a
    // jitter of a little over 12 days.
    require(maxJitterMs <= (Int.MaxValue / 2))

    // Negative jitter probably indicates misuse of the API
    require(maxJitterMs >= 0)

    // The random generator only produces non-negative numbers, so to
    // get an offset in [-maxJitter, maxJitter) we draw from
    // [0, 2 * maxJitter) and subtract maxJitter.
    private[this] val maxJitterRangeMs: Int = (maxJitterMs * 2).toInt

    // All calculations are done in milliseconds, so the soft TTL is
    // converted once, up front.
    private[this] val softTtlMs: Long = softTtl.inMilliseconds

    // Values at or below this age are fresh for every possible jitter.
    private[this] val alwaysFreshAgeMs: Long = softTtlMs - maxJitterMs

    // Values above this age are stale for every possible jitter.
    private[this] val alwaysStaleAgeMs: Long = softTtlMs + maxJitterMs

    /**
     * Compare the age of the entry (now minus `cachedAt`) with the soft
     * TTL. A random jitter is only drawn when the age falls inside the
     * window where the jitter can change the outcome.
     */
    override def isFresh(value: Any, cachedAt: Time): Boolean = {
      val ageMs: Long = (Time.now - cachedAt).inMilliseconds
      val fresh =
        ageMs match {
          case age if age <= alwaysFreshAgeMs => true
          case age if age > alwaysStaleAgeMs => false
          case age =>
            val jitterMs: Long = rng.nextInt(maxJitterRangeMs) - maxJitterMs
            age <= softTtlMs + jitterMs
        }

      logger.ifTrace(
        s"Checked soft ttl: fresh = $fresh, " +
          s"soft_ttl_ms = $softTtlMs, age_ms = $ageMs, value = $value")

      fresh
    }
  }
}
|
|
@ -0,0 +1,65 @@
|
|||
package com.twitter.tweetypie.caching
|
||||
|
||||
import scala.collection.mutable
|
||||
import com.twitter.util.Future
|
||||
import com.twitter.stitch.Stitch
|
||||
import com.twitter.stitch.Runner
|
||||
import com.twitter.stitch.FutureRunner
|
||||
import com.twitter.stitch.Group
|
||||
|
||||
/**
 * Workaround for an infelicity in the implementation of [[Stitch.async]].
 *
 * This has the same semantics as [[Stitch.async]], except that
 * interrupts to the main computation will not interrupt the async
 * call.
 *
 * The problem this implementation solves is that we do not want async
 * calls grouped together with synchronous calls. (The original author
 * points at a mailing-list thread for the discussion; it is not
 * linked from this file.) This may eventually be fixed in Stitch.
 */
private[caching] object StitchAsync {

  // Wraps a by-name Stitch so that it is only built when the group runs.
  private[this] class AsyncCall(thunk: => Stitch[_]) {
    def call(): Stitch[_] = thunk
  }

  private object AsyncGroup extends Group[AsyncCall, Unit] {
    override def runner(): Runner[AsyncCall, Unit] =
      new FutureRunner[AsyncCall, Unit] {
        // Every deferred call handed to this runner, whatever its
        // type. Running them together in `run` lets the usual Stitch
        // batching and deduping apply.
        private[this] val pending = new mutable.ArrayBuffer[AsyncCall]

        def add(call: AsyncCall): Stitch[Unit] = {
          // Only record the call; nothing is executed yet.
          pending.append(call)

          // The caller never waits on the effect, so a constant
          // result is all that is needed.
          Stitch.Unit
        }

        def run(): Future[_] = {
          // The Future produced by this inner Stitch.run is
          // deliberately dropped: it is not linked to the Future
          // returned below, so the effects are detached from the
          // outer run that invoked this method.
          Stitch.run {
            Stitch.traverse(pending) { deferred: AsyncCall =>
              // liftToTry keeps one failing call from interrupting
              // the others.
              deferred.call().liftToTry
            }
          }
          Future.Unit
        }
      }
  }

  /**
   * Schedule `call` to run asynchronously. All async calls share a
   * single group, and the returned Stitch does not wait for `call`.
   */
  def apply(call: => Stitch[_]): Stitch[Unit] =
    Stitch.call(new AsyncCall(call), AsyncGroup)
}
|
|
@ -0,0 +1,62 @@
|
|||
package com.twitter.tweetypie.caching
|
||||
|
||||
import com.twitter.stitch.MapGroup
|
||||
import com.twitter.stitch.Group
|
||||
import com.twitter.stitch.Stitch
|
||||
import com.twitter.util.Future
|
||||
import com.twitter.util.Return
|
||||
import com.twitter.util.Try
|
||||
|
||||
/**
 * Wrapper around [[CacheOperations]] providing a [[Stitch]] API.
 *
 * Reads and writes each go through their own Stitch group, so calls
 * made in the same batch are handed to the underlying operations
 * together.
 */
case class StitchCacheOperations[K, V](operations: CacheOperations[K, V]) {
  import StitchCacheOperations.SetCall

  // Batches lookups: the keys of a batch are passed to a single
  // `operations.get`, and each result is paired with its key by
  // position.
  private[this] val getGroup: Group[K, CacheResult[V]] =
    MapGroup[K, CacheResult[V]] { keys: Seq[K] =>
      operations
        .get(keys)
        .map { values =>
          // Build a strict map. `toMap.mapValues(...)` only creates a
          // lazy view that re-applies the function on every lookup
          // (and is deprecated from Scala 2.13 on).
          keys
            .zip(values)
            .map { case (key, result) => key -> (Return(result): Try[CacheResult[V]]) }
            .toMap
        }
    }

  /**
   * Performs a [[CacheOperations.get]] for a single key.
   */
  def get(key: K): Stitch[CacheResult[V]] =
    Stitch.call(key, getGroup)

  // Batches writes. Each call is issued as its own `operations.set`;
  // failures are captured per call rather than failing the batch.
  private[this] val setGroup: Group[SetCall[K, V], Unit] =
    new MapGroup[SetCall[K, V], Unit] {

      override def run(calls: Seq[SetCall[K, V]]): Future[SetCall[K, V] => Try[Unit]] =
        Future
          .collectToTry(calls.map(call => operations.set(call.key, call.value)))
          .map(tries => calls.zip(tries).toMap)
    }

  /**
   * Performs a [[CacheOperations.set]].
   */
  def set(key: K, value: V): Stitch[Unit] =
    // This is implemented as a Stitch.call instead of a Stitch.future
    // in order to handle the case where a batch has a duplicate
    // key. Each copy of the duplicate key will trigger a write back
    // to cache, so we dedupe the writes in order to avoid the
    // extraneous RPC call.
    Stitch.call(new StitchCacheOperations.SetCall(key, value), setGroup)
}
|
||||
|
||||
object StitchCacheOperations {

  /**
   * The "call" type used by the set group of [[StitchCacheOperations]].
   * It is effectively a (key, value) pair whose equality and hash code
   * look at the key alone, so two writes for the same key compare
   * equal regardless of their values.
   */
  private class SetCall[K, V](val key: K, val value: V) {

    // The value is deliberately left out of the comparison.
    override def equals(other: Any): Boolean = other match {
      case that: SetCall[_, _] => key == that.key
      case _ => false
    }

    // Must stay consistent with `equals`: derived from the key only.
    override def hashCode: Int = key.hashCode
  }
}
|
|
@ -0,0 +1,36 @@
|
|||
package com.twitter.tweetypie.caching
|
||||
|
||||
import com.twitter.stitch.Stitch
|
||||
|
||||
/**
 * Read-through caching for a [[Stitch]]-returning function.
 *
 * @see CacheResult for more information about the semantics
 *   implemented here.
 */
class StitchCaching[K, V](operations: CacheOperations[K, V], repo: K => Stitch[V])
    extends (K => Stitch[V]) {

  private[this] val stitchOps = StitchCacheOperations(operations)

  // Load the value from `repo` and store whatever it returns.
  private[this] def reload(key: K): Stitch[Unit] =
    repo(key).flatMap(refreshed => stitchOps.set(key, refreshed))

  override def apply(key: K): Stitch[V] =
    stitchOps.get(key).flatMap {
      // Usable as-is.
      case CacheResult.Fresh(value) =>
        Stitch.value(value)

      // Hand back the stale value right away and refresh the cache
      // entry asynchronously.
      case CacheResult.Stale(staleValue) =>
        StitchAsync(reload(key)).map(_ => staleValue)

      // Nothing cached: load from the repo, then write the result back
      // asynchronously as an effect.
      case CacheResult.Miss =>
        repo(key).applyEffect(value => StitchAsync(stitchOps.set(key, value)))

      case CacheResult.Failure(_) =>
        // In the case of failure, we don't attempt to write back to
        // cache, because cache failure usually means communication
        // failure, and sending more requests to the cache that holds
        // the value for this key could make the situation worse.
        repo(key)
    }
}
|
|
@ -0,0 +1,47 @@
|
|||
package com.twitter.tweetypie.caching
|
||||
|
||||
import com.twitter.io.Buf
|
||||
import com.twitter.util.Time
|
||||
|
||||
/**
 * Describes how values of type V are stored in cache: whether a given
 * value is cacheable at all, how it is turned into bytes, when the
 * entry should expire, and how bytes read back from cache are
 * interpreted.
 */
trait ValueSerializer[V] {

  /**
   * Prepare a value for storage in cache.
   *
   * A [[Some]] result carries the bytes to store, which should be a
   * valid input to [[deserialize]], together with the [[Time]] that
   * will be used as the expiry in the memcached command. A [[None]]
   * result means the value cannot or should not be written back to
   * cache.
   *
   * Returning None is most commonly needed when caching Try values.
   * Some exceptional values describe a cacheable state: for example,
   * Throw(NotFound) is commonly used to encode a missing value, and
   * those negative lookups are usually worth caching. Others, such as
   * a timeout exception, should not be cached.
   *
   * @return a pair of the expiry time for this cache entry and the
   *   bytes to store in cache. If you do not want this value to
   *   explicitly expire, use Time.Top as the expiry.
   */
  def serialize(value: V): Option[(Time, Buf)]

  /**
   * Turn bytes found in memcache into a [[CacheResult]].
   *
   * Most implementations will return [[CacheResult.Fresh]] or
   * [[CacheResult.Stale]], but any [[CacheResult]] may be returned,
   * depending on the behavior that you want.
   *
   * The signature has no failure wrapper because in the common case
   * the bytes stored in cache will be appropriate for the serializer.
   * Implementations are free to throw any exception if the bytes are
   * not valid.
   */
  def deserialize(serializedValue: Buf): CacheResult[V]
}
|
|
@ -0,0 +1,15 @@
|
|||
# Library target for the Scala sources in this directory.
# Dependencies are kept in lexicographic order.
scala_library(
    sources = ["*.scala"],
    compiler_option_sets = ["fatal_warnings"],
    platform = "java8",
    strict_deps = True,
    tags = ["bazel-compatible"],
    dependencies = [
        "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication",
        "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/transport",
        "finagle/finagle-thrift/src/main/scala",
        "strato/src/main/scala/com/twitter/strato/access",
        "strato/src/main/scala/com/twitter/strato/data",
        "tweetypie/servo/util/src/main/scala:exception",
    ],
)
|
|
@ -0,0 +1,185 @@
|
|||
package com.twitter.tweetypie.client_id
|
||||
|
||||
import com.twitter.finagle.mtls.authentication.EmptyServiceIdentifier
|
||||
import com.twitter.finagle.mtls.authentication.ServiceIdentifier
|
||||
import com.twitter.finagle.mtls.transport.S2STransport
|
||||
import com.twitter.finagle.thrift.ClientId
|
||||
import com.twitter.servo.util.Gate
|
||||
import com.twitter.strato.access.Access
|
||||
import com.twitter.strato.access.Access.ForwardedServiceIdentifier
|
||||
|
||||
object ClientIdHelper {

  val UnknownClientId = "unknown"

  /** A helper that derives service identifiers with [[UseTransportServiceIdentifier]]. */
  def default: ClientIdHelper = new ClientIdHelper(UseTransportServiceIdentifier)

  /**
   * Drops the final ".element" of a client id (usually ".prod" or
   * ".staging"). An id without any '.' is returned unchanged.
   */
  def getClientIdRoot(clientId: String): String = {
    val lastDot = clientId.lastIndexOf('.')
    if (lastDot < 0) clientId else clientId.substring(0, lastDot)
  }

  /**
   * Returns the final ".element" of a client id, without the '.'.
   * An id without any '.' is returned unchanged.
   */
  def getClientIdEnv(clientId: String): String = {
    val lastDot = clientId.lastIndexOf('.')
    if (lastDot < 0) clientId else clientId.substring(lastDot + 1)
  }

  // Renders a service identifier as "service.environment".
  private[client_id] def asClientId(s: ServiceIdentifier): String =
    s"${s.service}.${s.environment}"
}
|
||||
|
||||
class ClientIdHelper(serviceIdentifierStrategy: ServiceIdentifierStrategy) {

  private[client_id] val ProcessPathPrefix = "/p/"

  /**
   * The effective client id is used for request authorization and metrics
   * attribution. For calls to Tweetypie's thrift API, the thrift ClientId
   * is used and is expected in the form of "service-name.env". Federated
   * Strato clients don't support configured ClientIds and instead provide
   * a "process path" containing instance-specific information. So for
   * calls to the federated API, we compute an effective client id from
   * the ServiceIdentifier, if present, in Strato's Access principals. The
   * implementation avoids computing this identifier unless necessary,
   * since this method is invoked on every request.
   */
  def effectiveClientId: Option[String] = {
    val thriftClientId: Option[String] = ClientId.current.map(_.name)

    thriftClientId match {
      // A thrift ClientId that is not a process path is used as-is.
      // Process paths are instance-specific and aren't supported by
      // tweetypie for authorization or metrics purposes.
      case supported @ Some(name) if !name.startsWith(ProcessPathPrefix) =>
        supported

      // The thrift ClientId is undefined or unsupported: try to derive
      // a value from the ServiceIdentifier. If that yields nothing,
      // fall back to the ClientId exactly as given, even in an
      // unsupported format, so that error text and debug logs include
      // the value passed by the caller.
      case _ =>
        serviceIdentifierStrategy.serviceIdentifier
          .map(ClientIdHelper.asClientId)
          .orElse(thriftClientId)
    }
  }

  /** The effective client id with its final ".element" removed. */
  def effectiveClientIdRoot: Option[String] =
    effectiveClientId.map(ClientIdHelper.getClientIdRoot)

  /** The service identifier chosen by the configured strategy, if any. */
  def effectiveServiceIdentifier: Option[ServiceIdentifier] =
    serviceIdentifierStrategy.serviceIdentifier
}
|
||||
|
||||
/** Decides how to find a [[ServiceIdentifier]] for the purpose of crafting a client ID. */
trait ServiceIdentifierStrategy {
  def serviceIdentifier: Option[ServiceIdentifier]

  /**
   * Returns the single element of the given [[Set]], or [[None]] when
   * the set is empty or holds more than one element.
   *
   * This utility is used defensively against a set of principals collected
   * from [[Access.getPrincipals]]. While the contract is that there should be at most one
   * instance of each principal kind present in that set, in practice that has not always
   * been the case. The safest strategy in that case is to abandon the set completely if
   * more than one principal is competing.
   */
  final protected def onlyElement[T](set: Set[T]): Option[T] =
    set.size match {
      case 0 | 1 => set.headOption
      case _ => None
    }
}
|
||||
|
||||
/**
 * Picks the [[ServiceIdentifier]] from the Finagle SSL Transport, if one exists.
 *
 * This works for both Thrift API calls and StratoFed API calls. Strato's
 * [[Access#getPrincipals]] collection, which would typically be consulted by StratoFed
 * column logic, contains the same [[ServiceIdentifier]] derived from the Finagle SSL
 * transport, so there's no need to have separate strategies for Thrift vs StratoFed
 * calls.
 *
 * This is the default behavior of using [[ServiceIdentifier]] for computing client ID.
 */
private[client_id] class UseTransportServiceIdentifier(
  // overridable for testing
  getPeerServiceIdentifier: => ServiceIdentifier,
) extends ServiceIdentifierStrategy {

  // The by-name lookup is evaluated once per call; the empty
  // identifier is reported as "no identifier".
  override def serviceIdentifier: Option[ServiceIdentifier] = {
    val peer = getPeerServiceIdentifier
    if (EmptyServiceIdentifier == peer) None else Some(peer)
  }
}

object UseTransportServiceIdentifier
    extends UseTransportServiceIdentifier(S2STransport.peerServiceIdentifier)
|
||||
|
||||
/**
 * Picks [[ForwardedServiceIdentifier]] from Strato principals for client ID
 * if [[ServiceIdentifier]] points at call coming from Strato.
 * If not present, falls back to [[UseTransportServiceIdentifier]] behavior.
 *
 * Tweetypie utilizes the strategy to pick [[ServiceIdentifier]] for the purpose
 * of generating a client ID when the client ID is absent or unknown.
 * [[PreferForwardedServiceIdentifierForStrato]] looks for the [[ForwardedServiceIdentifier]]
 * values set by stratoserver request.
 * The reason is, stratoserver is effectively a conduit, forwarding the [[ServiceIdentifier]]
 * of the _actual client_ that is calling stratoserver.
 * Any direct callers not going through stratoserver will default to [[ServiceIdentifier]].
 */
private[client_id] class PreferForwardedServiceIdentifierForStrato(
  // overridable for testing
  getPeerServiceIdentifier: => ServiceIdentifier,
) extends ServiceIdentifierStrategy {
  val useTransportServiceIdentifier =
    new UseTransportServiceIdentifier(getPeerServiceIdentifier)

  override def serviceIdentifier: Option[ServiceIdentifier] =
    useTransportServiceIdentifier.serviceIdentifier match {
      case transport @ Some(peer) if isStrato(peer) =>
        // Prefer the single forwarded identifier. If there is none, or
        // several compete, fall back to the transport identifier that
        // was just matched instead of looking it up a second time.
        onlyElement(
          Access.getPrincipals
            .collect {
              case forwarded: ForwardedServiceIdentifier =>
                forwarded.serviceIdentifier.serviceIdentifier
            }
        ).orElse(transport)
      // Not a Strato caller, or no transport identifier at all.
      case other => other
    }

  /**
   * Strato uses various service names like "stratoserver" and "stratoserver-patient".
   * They all do start with "stratoserver" though, so at the point of implementing,
   * the safest bet to recognize strato is to look for this prefix.
   *
   * This also works for staged strato instances (which it should), despite allowing
   * for technically any caller to force this strategy, by creating service certificate
   * with this service name.
   */
  private def isStrato(serviceIdentifier: ServiceIdentifier): Boolean =
    serviceIdentifier.service.startsWith("stratoserver")
}

object PreferForwardedServiceIdentifierForStrato
    extends PreferForwardedServiceIdentifierForStrato(S2STransport.peerServiceIdentifier)
|
||||
|
||||
/**
 * A [[ServiceIdentifierStrategy]] that picks one of two delegates each time
 * [[serviceIdentifier]] is called, based on the current value of a unitary gate.
 */
class ConditionalServiceIdentifierStrategy(
  private val condition: Gate[Unit],
  private val ifTrue: ServiceIdentifierStrategy,
  private val ifFalse: ServiceIdentifierStrategy)
    extends ServiceIdentifierStrategy {

  override def serviceIdentifier: Option[ServiceIdentifier] = {
    // The gate is consulted on every call, so the choice can change
    // between requests.
    val chosen = if (condition()) ifTrue else ifFalse
    chosen.serviceIdentifier
  }
}
|
|
@ -0,0 +1,19 @@
|
|||
# Library target for the Scala sources in this directory, published as
# the `com.twitter.tweetypie` / `context` artifact.
scala_library(
    sources = ["*.scala"],
    compiler_option_sets = ["fatal_warnings"],
    platform = "java8",
    provides = scala_artifact(
        org = "com.twitter.tweetypie",
        name = "context",
        repo = artifactory,
    ),
    strict_deps = True,
    tags = ["bazel-compatible"],
    dependencies = [
        "finagle/finagle-core/src/main",
        "graphql/common/src/main/scala/com/twitter/graphql/common/core",
        "src/thrift/com/twitter/context:twitter-context-scala",
        "twitter-context/src/main/scala",
        "util/util-core:scala",
    ],
)
|
|
@ -0,0 +1,135 @@
|
|||
package com.twitter.tweetypie.context
|
||||
|
||||
import com.twitter.context.TwitterContext
|
||||
import com.twitter.finagle.Filter
|
||||
import com.twitter.finagle.Service
|
||||
import com.twitter.finagle.SimpleFilter
|
||||
import com.twitter.finagle.context.Contexts
|
||||
import com.twitter.io.Buf
|
||||
import com.twitter.io.Buf.ByteArray.Owned
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.graphql.common.core.GraphQlClientApplication
|
||||
import com.twitter.util.Try
|
||||
import java.nio.charset.StandardCharsets.UTF_8
|
||||
import scala.util.matching.Regex
|
||||
|
||||
/**
 * Context and filters to help track callers of Tweetypie's endpoints. This context and its
 * filters were originally added to provide visibility into callers of Tweetypie who are
 * using the birdherd library to access tweets.
 *
 * This context data is intended to be marshalled by callers to Tweetypie, but then the
 * context data is stripped (moved from broadcast to local). This happens so that the
 * context data is not forwarded down tweetypie's backend rpc chains, which often result
 * in transitive calls back into tweetypie. This effectively creates single-hop marshalling.
 */
object TweetypieContext {
  // Bring Tweetypie permitted TwitterContext into scope
  val TwitterContext: TwitterContext =
    com.twitter.context.TwitterContext(com.twitter.tweetypie.TwitterContextPermit)

  /** The tracked context; `via` names the caller and may carry a path after a ':'. */
  case class Ctx(via: String)

  /** Sentinel treated as "no context" by both [[Broadcast.let]] and [[Local.let]]. */
  val Empty = Ctx("")

  /** The marshalled (cross-process) form of the context. */
  object Broadcast {
    // The `via` string is marshalled as UTF-8 bytes.
    private[this] object Key extends Contexts.broadcast.Key[Ctx](id = Ctx.getClass.getName) {

      override def marshal(value: Ctx): Buf =
        Owned(value.via.getBytes(UTF_8))

      override def tryUnmarshal(buf: Buf): Try[Ctx] =
        Try(Ctx(new String(Owned.extract(buf), UTF_8)))
    }

    private[TweetypieContext] def current(): Option[Ctx] =
      Contexts.broadcast.get(Key)

    def currentOrElse(default: Ctx): Ctx =
      current().getOrElse(default)

    /** Runs `f` with the broadcast context cleared. */
    def letClear[T](f: => T): T =
      Contexts.broadcast.letClear(Key)(f)

    /** Runs `f` with `ctx` as the broadcast context; [[Empty]] clears it instead. */
    def let[T](ctx: Ctx)(f: => T): T =
      if (Empty == ctx) {
        letClear(f)
      } else {
        Contexts.broadcast.let(Key, ctx)(f)
      }

    // ctx has to be by name so we can re-evaluate it for every request (for usage in ServiceTwitter.scala)
    def filter(ctx: => Ctx): Filter.TypeAgnostic =
      new Filter.TypeAgnostic {
        override def toFilter[Req, Rep]: Filter[Req, Rep, Req, Rep] =
          (request: Req, service: Service[Req, Rep]) => Broadcast.let(ctx)(service(request))
      }
  }

  /** The process-local form of the context, which is never marshalled. */
  object Local {
    private[this] val Key =
      new Contexts.local.Key[Ctx]

    // Sets the local context for `f`. An absent context and the Empty
    // context both clear it. The catch-all case matters: a caller can
    // marshal an empty `via`, which unmarshals to Some(Empty); matching
    // only `None` would throw a MatchError for that request.
    private[TweetypieContext] def let[T](ctx: Option[Ctx])(f: => T): T =
      ctx match {
        case Some(value) if value != Empty => Contexts.local.let(Key, value)(f)
        case _ => Contexts.local.letClear(Key)(f)
      }

    def current(): Option[Ctx] =
      Contexts.local.get(Key)

    /**
     * Moves the context from broadcast to local: the broadcast value is
     * read, then the service runs with the broadcast context cleared and
     * the local context set to that value.
     */
    def filter[Req, Rep]: SimpleFilter[Req, Rep] =
      (request: Req, service: Service[Req, Rep]) => {
        val ctx = Broadcast.current()
        Broadcast.letClear(Local.let(ctx)(service(request)))
      }

    // Application ids that are not in GraphQlClientApplication.AllById
    // are all reported under the single name "nonTOO".
    private[this] def clientAppIdToName(clientAppId: Long) =
      GraphQlClientApplication.AllById.get(clientAppId).map(_.name).getOrElse("nonTOO")

    // Applied in order to a sanitized path so that variable segments
    // (slugs, user names, numeric ids) collapse into fixed placeholders.
    private[this] val pathRegexes: Seq[(Regex, String)] = Seq(
      ("timeline_conversation_.*_json".r, "timeline_conversation__slug__json"),
      ("user_timeline_.*_json".r, "user_timeline__user__json"),
      ("[0-9]{2,}".r, "_id_")
    )

    // `context.via` will either be a string like "birdherd" or
    // "birdherd:/1.1/statuses/show/123.json", depending on whether
    // birdherd code was able to determine the path of the request.
    private[this] def getViaAndPath(via: String): (String, Option[String]) =
      via.split(":", 2) match {
        case Array(via, path) =>
          val sanitizedPath = path
            .replace('/', '_')
            .replace('.', '_')

          // Apply each regex in turn
          val normalizedPath = pathRegexes.foldLeft(sanitizedPath) {
            case (path, (regex, replacement)) => regex.replaceAllIn(path, replacement)
          }

          (via, Some(normalizedPath))
        case Array(via) => (via, None)
      }

    /**
     * Increments request counters under each of `scopes`, broken out by
     * `via`, by client application and, when present, by normalized path.
     * Nothing is recorded unless a local context, a TwitterContext and a
     * client application id are all available.
     */
    def trackStats[U](scopes: StatsReceiver*): Unit =
      for {
        tweetypieCtx <- TweetypieContext.Local.current()
        (via, pathOpt) = getViaAndPath(tweetypieCtx.via)
        twitterCtx <- TwitterContext()
        clientAppId <- twitterCtx.clientApplicationId
      } yield {
        val clientAppName = clientAppIdToName(clientAppId)
        scopes.foreach { stats =>
          val ctxStats = stats.scope("context")
          val viaStats = ctxStats.scope("via", via)
          viaStats.scope("all").counter("requests").incr()
          val viaClientStats = viaStats.scope("by_client", clientAppName)
          viaClientStats.counter("requests").incr()
          pathOpt.foreach { path =>
            val viaPathStats = viaStats.scope("by_path", path)
            viaPathStats.counter("requests").incr()
          }
        }
      }
  }
}
|
|
@ -0,0 +1,15 @@
|
|||
# Library target containing only DeciderGates.scala.
scala_library(
    sources = ["DeciderGates.scala"],
    compiler_option_sets = ["fatal_warnings"],
    platform = "java8",
    strict_deps = True,
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "decider",
        "finagle/finagle-toggle/src/main/scala/com/twitter/finagle/server",
        "tweetypie/servo/decider",
        "tweetypie/servo/util/src/main/scala",
        "util/util-core:scala",
    ],
)
|
|
@ -0,0 +1,60 @@
|
|||
package com.twitter.tweetypie
|
||||
package decider
|
||||
|
||||
import com.google.common.hash.Hashing
|
||||
import com.twitter.decider.Decider
|
||||
import com.twitter.decider.Feature
|
||||
import com.twitter.servo.gate.DeciderGate
|
||||
import com.twitter.servo.util.Gate
|
||||
import java.nio.charset.StandardCharsets
|
||||
import scala.collection.mutable
|
||||
/**
 * Helpers for building [[Gate]]s backed by decider features, with
 * optional fixed on/off overrides. Feature names are looked up in the
 * decider as `prefix` + "_" + name.
 */
trait DeciderGates {
  def overrides: Map[String, Boolean] = Map.empty
  def decider: Decider
  def prefix: String

  // Every feature name for which a gate has been requested.
  protected val seenFeatures: mutable.HashSet[String] = new mutable.HashSet[String]

  private def deciderFeature(name: String): Feature =
    decider.feature(s"${prefix}_$name")

  /**
   * Records `name` as seen and returns a constant gate when an override
   * exists for it; otherwise builds the gate from the decider feature.
   */
  def withOverride[T](name: String, mkGate: Feature => Gate[T]): Gate[T] = {
    seenFeatures += name
    overrides.get(name) match {
      case Some(forced) => Gate.const(forced)
      case None => mkGate(deciderFeature(name))
    }
  }

  protected def linear(name: String): Gate[Unit] = withOverride[Unit](name, DeciderGate.linear)
  protected def byId(name: String): Gate[Long] = withOverride[Long](name, DeciderGate.byId)

  /**
   * Returns a Gate[String] for checking availability of the feature.
   * The string is hashed into a Long with SipHash-2-4 over its UTF-8
   * bytes, and that Long is used as the "id" for servo's
   * DeciderGate.byId.
   *
   * @param name decider name
   * @return Gate[String]
   */
  protected def byStringId(name: String): Gate[String] =
    byId(name).contramap { id: String =>
      Hashing.sipHash24().hashString(id, StandardCharsets.UTF_8).asLong()
    }

  /** The names of all features requested so far. */
  def all: Traversable[String] = seenFeatures

  /** Override names for which no gate has been requested. */
  def unusedOverrides: Set[String] = overrides.keySet -- all

  /**
   * Generate a map of name -> availability, taking into account overrides.
   * Overrides are either on or off so map to 10000 or 0, respectively.
   */
  def availabilityMap: Map[String, Option[Int]] =
    all.map { name =>
      val availability: Option[Int] = overrides.get(name) match {
        case Some(true) => Some(10000)
        case Some(false) => Some(0)
        case None => deciderFeature(name).availability
      }

      name -> availability
    }.toMap
}
|
|
@ -0,0 +1,10 @@
|
|||
# Library target for the Scala sources in this directory.
scala_library(
    sources = ["*.scala"],
    compiler_option_sets = ["fatal_warnings"],
    platform = "java8",
    strict_deps = True,
    tags = ["bazel-compatible"],
    dependencies = [
        "decider",
    ],
)
|
|
@ -0,0 +1,42 @@
|
|||
package com.twitter.tweetypie.decider.overrides
|
||||
|
||||
import com.twitter.decider.LocalOverrides
|
||||
|
||||
/**
 * Local decider overrides for Tweetypie, one per feature.
 *
 * Each value is created with `feature` from the
 * `LocalOverrides.Namespace("tweetypie", "tweetypie_")` this object
 * extends. Presumably the second argument is a prefix applied to the
 * feature names below; confirm against `LocalOverrides.Namespace`.
 * Every member carries an explicit `LocalOverrides.Override` type so the
 * public API does not depend on inference.
 */
object TweetyPieDeciderOverrides extends LocalOverrides.Namespace("tweetypie", "tweetypie_") {
  val CheckSpamOnRetweet: LocalOverrides.Override = feature("check_spam_on_retweet")
  val CheckSpamOnTweet: LocalOverrides.Override = feature("check_spam_on_tweet")
  val ConversationControlUseFeatureSwitchResults: LocalOverrides.Override = feature(
    "conversation_control_use_feature_switch_results")
  val ConversationControlTweetCreateEnabled: LocalOverrides.Override = feature(
    "conversation_control_tweet_create_enabled")
  val EnableExclusiveTweetControlValidation: LocalOverrides.Override = feature(
    "enable_exclusive_tweet_control_validation")
  val EnableHotKeyCaches: LocalOverrides.Override = feature("enable_hot_key_caches")
  val HydrateConversationMuted: LocalOverrides.Override = feature("hydrate_conversation_muted")
  val HydrateExtensionsOnWrite: LocalOverrides.Override = feature("hydrate_extensions_on_write")
  val HydrateEscherbirdAnnotations: LocalOverrides.Override = feature(
    "hydrate_escherbird_annotations")
  val HydrateGnipProfileGeoEnrichment: LocalOverrides.Override = feature(
    "hydrate_gnip_profile_geo_enrichment")
  val HydratePastedPics: LocalOverrides.Override = feature("hydrate_pasted_pics")
  val HydratePerspectivesEditsForOtherSafetyLevels: LocalOverrides.Override = feature(
    "hydrate_perspectives_edits_for_other_levels")
  val HydrateScrubEngagements: LocalOverrides.Override = feature("hydrate_scrub_engagements")
  val LogRepoExceptions: LocalOverrides.Override = feature("log_repo_exceptions")
  val MediaRefsHydratorIncludePastedMedia: LocalOverrides.Override = feature(
    "media_refs_hydrator_include_pasted_media")
  val ShortCircuitLikelyPartialTweetReads: LocalOverrides.Override = feature(
    "short_circuit_likely_partial_tweet_reads_ms")
  val RateLimitByLimiterService: LocalOverrides.Override = feature("rate_limit_by_limiter_service")
  val RateLimitTweetCreationFailure: LocalOverrides.Override = feature(
    "rate_limit_tweet_creation_failure")
  val ReplyTweetConversationControlHydrationEnabled: LocalOverrides.Override = feature(
    "reply_tweet_conversation_control_hydration_enabled")
  val DisableInviteViaMention: LocalOverrides.Override = feature("disable_invite_via_mention")
  val EnableRemoveUnmentionedImplicitMentions: LocalOverrides.Override = feature(
    "enable_remove_unmentioned_implicit_mentions")
  val useReplicatedDeleteTweet2: LocalOverrides.Override = feature("use_replicated_delete_tweet_2")
}
|
|
@ -0,0 +1,15 @@
|
|||
# Library target for this directory (no explicit `sources` attribute).
# Dependencies are kept in lexicographic order.
scala_library(
    compiler_option_sets = ["fatal_warnings"],
    strict_deps = True,
    tags = ["bazel-compatible"],
    dependencies = [
        "finagle/finagle-core/src/main",
        "incentives/jiminy/src/main/thrift/com/twitter/incentives/jiminy:thrift-scala",
        "stitch/stitch-core",
        "strato/src/main/scala/com/twitter/strato/client",
        "tweetypie/server/src/main/scala/com/twitter/tweetypie/core",
        "tweetypie/servo/util/src/main/scala",
        "util/util-core",
        "util/util-stats",
    ],
)
|
|
@ -0,0 +1,165 @@
|
|||
package com.twitter.tweetypie.jiminy.tweetypie
|
||||
|
||||
import com.twitter.finagle.stats.CategorizingExceptionStatsHandler
|
||||
import com.twitter.finagle.stats.Stat
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.incentives.jiminy.thriftscala._
|
||||
import com.twitter.servo.util.FutureArrow
|
||||
import com.twitter.servo.util.Gate
|
||||
import com.twitter.stitch.Stitch
|
||||
import com.twitter.strato.thrift.ScroogeConvImplicits._
|
||||
import com.twitter.strato.client.{Client => StratoClient}
|
||||
import com.twitter.tweetypie.core.TweetCreateFailure
|
||||
import com.twitter.util.Future
|
||||
import com.twitter.util.Return
|
||||
import com.twitter.util.Throw
|
||||
|
||||
/**
 * Input to a [[NudgeBuilder]].
 *
 * @param text the tweet text, sent on as the nudge request's tweet text
 * @param inReplyToTweetId when non-empty, the tweet is treated as a reply
 * @param conversationId forwarded unchanged to the nudge request
 * @param hasQuotedTweet when true and the tweet is not a reply, it is
 *   treated as a quote tweet
 * @param nudgeOptions the nudge options for the request; dark requests
 *   substitute their own options
 * @param tweetId forwarded unchanged to the nudge request
 */
case class NudgeBuilderRequest(
  text: String,
  inReplyToTweetId: Option[NudgeBuilder.TweetId],
  conversationId: Option[NudgeBuilder.TweetId],
  hasQuotedTweet: Boolean,
  nudgeOptions: Option[CreateTweetNudgeOptions],
  tweetId: Option[NudgeBuilder.TweetId])
|
||||
|
||||
trait NudgeBuilder extends FutureArrow[NudgeBuilderRequest, Unit] {

  /**
   * Decide whether the user should be shown a nudge rather than having
   * the Tweet created. No nudge check is performed when the request's
   * nudgeOptions is None.
   *
   * @return Future.Unit if the user should not be nudged, or a
   *   Future.exception containing a [[TweetCreateFailure]] if the user
   *   should be nudged.
   */
  def apply(request: NudgeBuilderRequest): Future[Unit]
}
|
||||
|
||||
object NudgeBuilder {
|
||||
type Type = FutureArrow[NudgeBuilderRequest, Unit]
|
||||
type TweetId = Long
|
||||
|
||||
// Options used for dark traffic. To exercise the Jiminy backend
// accurately, dark requests ask for checks of all of the nudge types
// listed here instead of whatever the real request asked for.
private[this] val darkTrafficCreateNudgeOptions = Some(
  CreateTweetNudgeOptions(
    requestedNudgeTypes = Some(
      Set(
        TweetNudgeType.PotentiallyToxicTweet,
        TweetNudgeType.ReviseOrMute,
        TweetNudgeType.ReviseOrHideThenBlock,
        TweetNudgeType.ReviseOrBlock
      ))))
|
||||
|
||||
private[this] def mkJiminyRequest(
|
||||
request: NudgeBuilderRequest,
|
||||
isDarkRequest: Boolean = false
|
||||
): CreateTweetNudgeRequest = {
|
||||
val tweetType =
|
||||
if (request.inReplyToTweetId.nonEmpty) TweetType.Reply
|
||||
else if (request.hasQuotedTweet) TweetType.QuoteTweet
|
||||
else TweetType.OriginalTweet
|
||||
|
||||
CreateTweetNudgeRequest(
|
||||
tweetText = request.text,
|
||||
tweetType = tweetType,
|
||||
inReplyToTweetId = request.inReplyToTweetId,
|
||||
conversationId = request.conversationId,
|
||||
createTweetNudgeOptions =
|
||||
if (isDarkRequest) darkTrafficCreateNudgeOptions else request.nudgeOptions,
|
||||
tweetId = request.tweetId
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* NudgeBuilder implemented by calling the strato column `incentives/createNudge`.
|
||||
*
|
||||
* Stats recorded:
|
||||
* - latency_ms: Latency histogram (also implicitly number of
|
||||
* invocations). This is counted only in the case that a nudge
|
||||
* check was requested (`nudgeOptions` is non-empty)
|
||||
*
|
||||
* - nudge: The nudge check succeeded and a nudge was created.
|
||||
*
|
||||
* - no_nudge: The nudge check succeeded, but no nudge was created.
|
||||
*
|
||||
* - failures: Calling strato to create a nudge failed. Broken out
|
||||
* by exception.
|
||||
*/
|
||||
|
||||
def apply(
|
||||
nudgeArrow: FutureArrow[CreateTweetNudgeRequest, CreateTweetNudgeResponse],
|
||||
enableDarkTraffic: Gate[Unit],
|
||||
stats: StatsReceiver
|
||||
): NudgeBuilder = {
|
||||
new NudgeBuilder {
|
||||
private[this] val nudgeLatencyStat = stats.stat("latency_ms")
|
||||
private[this] val nudgeCounter = stats.counter("nudge")
|
||||
private[this] val noNudgeCounter = stats.counter("no_nudge")
|
||||
private[this] val darkRequestCounter = stats.counter("dark_request")
|
||||
private[this] val nudgeExceptionHandler = new CategorizingExceptionStatsHandler
|
||||
|
||||
override def apply(
|
||||
request: NudgeBuilderRequest
|
||||
): Future[Unit] =
|
||||
request.nudgeOptions match {
|
||||
case None =>
|
||||
if (enableDarkTraffic()) {
|
||||
darkRequestCounter.incr()
|
||||
Stat
|
||||
.timeFuture(nudgeLatencyStat) {
|
||||
nudgeArrow(mkJiminyRequest(request, isDarkRequest = true))
|
||||
}
|
||||
.transform { _ =>
|
||||
// ignore the response since it is a dark request
|
||||
Future.Done
|
||||
}
|
||||
} else {
|
||||
Future.Done
|
||||
}
|
||||
|
||||
case Some(_) =>
|
||||
Stat
|
||||
.timeFuture(nudgeLatencyStat) {
|
||||
nudgeArrow(mkJiminyRequest(request))
|
||||
}
|
||||
.transform {
|
||||
case Throw(e) =>
|
||||
nudgeExceptionHandler.record(stats, e)
|
||||
// If we failed to invoke the nudge column, then
|
||||
// just continue on with the Tweet creation.
|
||||
Future.Done
|
||||
|
||||
case Return(CreateTweetNudgeResponse(Some(nudge))) =>
|
||||
nudgeCounter.incr()
|
||||
Future.exception(TweetCreateFailure.Nudged(nudge = nudge))
|
||||
|
||||
case Return(CreateTweetNudgeResponse(None)) =>
|
||||
noNudgeCounter.incr()
|
||||
Future.Done
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
def apply(
|
||||
strato: StratoClient,
|
||||
enableDarkTraffic: Gate[Unit],
|
||||
stats: StatsReceiver
|
||||
): NudgeBuilder = {
|
||||
val executer =
|
||||
strato.executer[CreateTweetNudgeRequest, CreateTweetNudgeResponse](
|
||||
"incentives/createTweetNudge")
|
||||
val nudgeArrow: FutureArrow[CreateTweetNudgeRequest, CreateTweetNudgeResponse] = { req =>
|
||||
Stitch.run(executer.execute(req))
|
||||
}
|
||||
apply(nudgeArrow, enableDarkTraffic, stats)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
scala_library(
|
||||
sources = ["*.scala"],
|
||||
compiler_option_sets = ["fatal_warnings"],
|
||||
platform = "java8",
|
||||
strict_deps = True,
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"src/java/com/twitter/common/text/language:language-identifier",
|
||||
"src/java/com/twitter/common/text/language:locale-util",
|
||||
"src/java/com/twitter/common/text/pipeline",
|
||||
"src/java/com/twitter/common/text/token",
|
||||
"src/java/com/twitter/common_internal/text",
|
||||
"src/java/com/twitter/common_internal/text/version",
|
||||
"tweetypie/src/resources/com/twitter/tweetypie/matching",
|
||||
"util/util-core/src/main/scala/com/twitter/concurrent",
|
||||
"util/util-core/src/main/scala/com/twitter/io",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,92 @@
|
|||
package com.twitter.tweetypie.matching
|
||||
|
||||
object TokenSequence {

  /**
   * Is `suffix` a suffix of `s`, starting at `offset` in `s`? That is, do the
   * characters of `s` from index `offset` to its end equal the characters of
   * `suffix`?
   *
   * @param s      the sequence that is searched
   * @param suffix the characters expected at the end of `s`
   * @param offset index into `s` at which `suffix` is expected to begin
   * @return true when `s` from `offset` onwards holds exactly the characters
   *         of `suffix`; false otherwise, including when `offset` lies
   *         outside of `s`
   */
  def hasSuffixAt(s: CharSequence, suffix: CharSequence, offset: Int): Boolean =
    if (offset == 0 && (s.eq(suffix) || s == suffix)) {
      // Fast path: the very same object, or objects that consider themselves equal.
      true
    } else if (offset < 0 || suffix.length != (s.length - offset)) {
      // A negative offset can never denote a suffix. Rejecting it here also
      // keeps the character loop below from indexing before the start of `s`
      // (which previously raised IndexOutOfBoundsException when the lengths
      // happened to line up). An offset past the end of `s` yields a negative
      // remaining length, which no suffix length can equal.
      false
    } else {
      @annotation.tailrec
      def go(i: Int): Boolean =
        if (i == suffix.length) true
        else if (suffix.charAt(i) == s.charAt(offset + i)) go(i + 1)
        else false

      go(0)
    }

  /**
   * Do two [[CharSequence]]s contain the same characters?
   *
   * [[CharSequence]] equality is not sufficient because
   * [[CharSequence]]s of different types may not consider other
   * [[CharSequence]]s containing the same characters equivalent.
   */
  def sameCharacters(s1: CharSequence, s2: CharSequence): Boolean =
    hasSuffixAt(s1, s2, 0)

  /**
   * This method implements the product definition of a token matching a
   * keyword. That definition is:
   *
   * - The token contains the same characters as the keyword.
   * - The token contains the same characters as the keyword after
   *   dropping a leading '#' or '@' from the token.
   *
   * The intention is that a keyword matches an identical hashtag, but
   * if the keyword itself is a hashtag, it only matches the hashtag
   * form.
   *
   * The tokenization process should rule out tokens or keywords that
   * start with multiple '#' characters, even though this implementation
   * allows for e.g. token "##a" to match "#a".
   */
  def tokenMatches(token: CharSequence, keyword: CharSequence): Boolean =
    if (sameCharacters(token, keyword)) true
    else if (token.length == 0) false
    else {
      val tokenStart = token.charAt(0)
      (tokenStart == '#' || tokenStart == '@') && hasSuffixAt(token, keyword, 1)
    }
}
|
||||
|
||||
/**
 * A sequence of normalized tokens. The sequence depends on the locale
 * in which the text was parsed and the version of the penguin library
 * that was used at tokenization time.
 *
 * @param toIndexedSeq the tokens, in the order in which they were produced
 */
case class TokenSequence private[matching] (toIndexedSeq: IndexedSeq[CharSequence]) {
  import TokenSequence.tokenMatches

  // Positional access to a single token.
  private def apply(i: Int): CharSequence = toIndexedSeq(i)

  def isEmpty: Boolean = toIndexedSeq.isEmpty
  def nonEmpty: Boolean = toIndexedSeq.nonEmpty

  /**
   * Does the supplied sequence of keywords match a consecutive run of tokens
   * within this sequence? Each keyword is compared to its token with
   * `TokenSequence.tokenMatches`.
   *
   * An empty keyword sequence matches every token sequence.
   */
  def containsKeywordSequence(keywords: TokenSequence): Boolean = {
    val keywordCount = keywords.toIndexedSeq.length
    // The last start position at which all of the keywords still fit; this is
    // negative (giving an empty range below) when there are more keywords
    // than tokens.
    val lastStart = toIndexedSeq.length - keywordCount

    (0 to lastStart).exists { start =>
      (0 until keywordCount).forall { i =>
        tokenMatches(this(start + i), keywords(i))
      }
    }
  }
}
|
|
@ -0,0 +1,156 @@
|
|||
package com.twitter.tweetypie.matching
|
||||
|
||||
import com.twitter.common.text.language.LocaleUtil
|
||||
import com.twitter.common_internal.text.pipeline.TwitterTextNormalizer
|
||||
import com.twitter.common_internal.text.pipeline.TwitterTextTokenizer
|
||||
import com.twitter.common_internal.text.version.PenguinVersion
|
||||
import com.twitter.concurrent.Once
|
||||
import com.twitter.io.StreamIO
|
||||
import java.util.Locale
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Turns input text into a [[TokenSequence]] of normalized tokens.
 * Implementations configure normalization and tokenization so that the
 * resulting sequences are suitable for keyword matching between texts.
 */
trait Tokenizer {

  /**
   * @param input the raw text
   * @return the normalized tokens extracted from `input`
   */
  def tokenize(input: String): TokenSequence
}
|
||||
|
||||
object Tokenizer {

  /**
   * When a Penguin version is not explicitly specified, use this
   * version of Penguin to perform normalization and tokenization. If
   * you cache tokenized text, be sure to store the version as well, to
   * avoid comparing text that was normalized with different algorithms.
   */
  val DefaultPenguinVersion: PenguinVersion = PenguinVersion.PENGUIN_6

  /**
   * If you already know the locale of the text that is being tokenized,
   * use this method to get a tokenizer that is much more efficient than
   * the Tweet or Query tokenizer, since it does not have to perform
   * language detection.
   */
  def forLocale(locale: Locale): Tokenizer = get(locale, DefaultPenguinVersion)

  /**
   * Obtain a `Tokenizer` that will tokenize the text for the given
   * locale and version of the Penguin library.
   */
  def get(locale: Locale, version: PenguinVersion): Tokenizer =
    TokenizerFactories(version).forLocale(locale)

  /**
   * Encapsulates the configuration and use of [[TwitterTextTokenizer]]
   * and [[TwitterTextNormalizer]].
   */
  private[this] class TokenizerFactory(version: PenguinVersion) {
    // The normalizer is thread-safe, so share one instance.
    private[this] val normalizer =
      (new TwitterTextNormalizer.Builder(version)).build()

    // The TwitterTextTokenizer is relatively expensive to build,
    // and is not thread safe, so keep instances of it in a
    // ThreadLocal.
    private[this] val local =
      new ThreadLocal[TwitterTextTokenizer] {
        override def initialValue: TwitterTextTokenizer =
          (new TwitterTextTokenizer.Builder(version)).build()
      }

    /**
     * Obtain a [[Tokenizer]] for this combination of [[PenguinVersion]]
     * and [[Locale]].
     */
    def forLocale(locale: Locale): Tokenizer =
      new Tokenizer {
        override def tokenize(input: String): TokenSequence = {
          // Normalize first, then drain the current thread's token stream.
          val stream = local.get.getTwitterTokenStreamFor(locale)
          stream.reset(normalizer.normalize(input, locale))
          val builder = IndexedSeq.newBuilder[CharSequence]
          while (stream.incrementToken) builder += stream.term()
          TokenSequence(builder.result())
        }
      }
  }

  /**
   * Since there are a small number of Penguin versions, eagerly
   * initialize a TokenizerFactory for each version, to avoid managing
   * mutable state.
   */
  private[this] val TokenizerFactories: PenguinVersion => TokenizerFactory =
    PenguinVersion.values.map(v => v -> new TokenizerFactory(v)).toMap

  /**
   * The set of locales used in warmup. These locales are mentioned in
   * the logic of TwitterTextTokenizer and TwitterTextNormalizer.
   * Duplicates across the three sources are removed.
   */
  private[this] val WarmUpLocales: Seq[Locale] =
    Seq
      .concat(
        Seq(
          Locale.JAPANESE,
          Locale.KOREAN,
          LocaleUtil.UNKNOWN,
          LocaleUtil.THAI,
          LocaleUtil.ARABIC,
          LocaleUtil.SWEDISH
        ),
        LocaleUtil.CHINESE_JAPANESE_LOCALES.asScala,
        LocaleUtil.CJK_LOCALES.asScala
      )
      .toSet
      .toArray
      .toSeq

  /**
   * Load the default inputs that are used for warming up this library:
   * one entry per line of the `warmup-text.txt` classpath resource, decoded
   * as UTF-8.
   *
   * @throws IllegalStateException if the resource cannot be found
   */
  def warmUpCorpus(): Seq[String] = {
    val resourceName = "warmup-text.txt"
    // getResourceAsStream signals a missing resource by returning null; turn
    // that into a descriptive failure instead of a NullPointerException.
    val stream =
      Option(getClass.getResourceAsStream(resourceName)).getOrElse {
        throw new IllegalStateException(s"Warmup resource not found: $resourceName")
      }
    val bytes =
      try StreamIO.buffer(stream)
      finally stream.close()
    bytes.toString("UTF-8").linesIterator.toArray.toSeq
  }

  /**
   * Exercise the functionality of this library on the specified
   * strings. In general, prefer [[warmUp]] to this method.
   *
   * @param ver   the Penguin version to exercise
   * @param texts the texts to tokenize and validate
   */
  def warmUpWith(ver: PenguinVersion, texts: Iterable[String]): Unit =
    texts.foreach { txt =>
      // Exercise each locale
      WarmUpLocales.foreach { loc =>
        Tokenizer.get(loc, ver).tokenize(txt)
        UserMutes.builder().withPenguinVersion(ver).withLocale(loc).validate(txt)
      }

      // Exercise language detection
      TweetTokenizer.get(ver).tokenize(txt)
      UserMutes.builder().withPenguinVersion(ver).validate(txt)
    }

  private[this] val warmUpOnce = Once(warmUpWith(DefaultPenguinVersion, warmUpCorpus()))

  /**
   * The creation of the first TwitterTextTokenizer is relatively
   * expensive, and tokenizing some texts may cause significant
   * initialization.
   *
   * This method exercises the functionality of this library
   * with a range of texts in order to perform as much initialization as
   * possible before the library is used in a latency-sensitive way.
   *
   * The warmup routine will only run once. Subsequent invocations of
   * `warmUp` will not do additional work, and will return once warmup is
   * complete.
   *
   * The warmup will take on the order of seconds.
   */
  def warmUp(): Unit = warmUpOnce()
}
|
|
@ -0,0 +1,45 @@
|
|||
package com.twitter.tweetypie.matching
|
||||
|
||||
import com.twitter.common.text.pipeline.TwitterLanguageIdentifier
|
||||
import com.twitter.common_internal.text.version.PenguinVersion
|
||||
import java.util.Locale
|
||||
|
||||
/**
 * A [[Tokenizer]] for Tweet text that detects the language of the text
 * before tokenizing it.
 */
object TweetTokenizer extends Tokenizer {
  type LocalePicking = Option[Locale] => Tokenizer

  /**
   * A function that picks a Tokenizer based on an optional locale: the
   * locale-specific Tokenizer when a locale is supplied, and this
   * language-detecting Tokenizer when it is not.
   */
  def localePicking: LocalePicking =
    localeOpt => localeOpt.fold[Tokenizer](TweetTokenizer)(locale => Tokenizer.forLocale(locale))

  private[this] val tweetLangIdentifier =
    (new TwitterLanguageIdentifier.Builder).buildForTweet()

  /**
   * A Tokenizer for the given Penguin version that first identifies the
   * language of the text, and then tokenizes with the Tokenizer for the
   * identified locale. When the locale of the Tweet text is already known,
   * `Tokenizer.get` is much cheaper.
   */
  def get(version: PenguinVersion): Tokenizer =
    new Tokenizer {
      override def tokenize(text: String): TokenSequence = {
        val detectedLocale = tweetLangIdentifier.identify(text).getLocale
        val localeTokenizer = Tokenizer.get(detectedLocale, version)
        localeTokenizer.tokenize(text)
      }
    }

  // The language-detecting Tokenizer for the default Penguin version.
  private[this] val Default = get(Tokenizer.DefaultPenguinVersion)

  /**
   * Tokenize the text with Tweet language detection and
   * `Tokenizer.DefaultPenguinVersion`. When the language of the text is
   * already known, prefer `Tokenizer.forLocale`.
   */
  override def tokenize(tweetText: String): TokenSequence =
    Default.tokenize(tweetText)
}
|
|
@ -0,0 +1,128 @@
|
|||
package com.twitter.tweetypie.matching
|
||||
|
||||
import com.twitter.common.text.pipeline.TwitterLanguageIdentifier
|
||||
import com.twitter.common_internal.text.version.PenguinVersion
|
||||
import java.util.Locale
|
||||
import scala.collection.JavaConversions.asScalaBuffer
|
||||
|
||||
object UserMutesBuilder {

  // The starting point for all builders: the default Penguin version, and no
  // locale (so the language of each phrase is detected).
  private[matching] val Default: UserMutesBuilder =
    new UserMutesBuilder(penguinVersion = Tokenizer.DefaultPenguinVersion, localeOpt = None)

  // Language identifier built for queries; used when no locale is configured.
  private val queryLangIdentifier =
    (new TwitterLanguageIdentifier.Builder).buildForQuery()
}
|
||||
|
||||
/**
 * Immutable builder that turns a user's raw muted phrases into a
 * [[UserMutes]]. Each `with*` / `detectLocale` call returns a builder with
 * the adjusted configuration and leaves the receiver untouched.
 *
 * @param penguinVersion Penguin version used for tokenization
 * @param localeOpt      locale used for tokenization; when empty, the
 *                       language of each phrase is detected instead
 */
class UserMutesBuilder private (penguinVersion: PenguinVersion, localeOpt: Option[Locale]) {

  /**
   * Use the specified Penguin version when tokenizing a keyword mute
   * string. In general, use the default version, unless you need to
   * specify a particular version for compatibility with another system
   * that is using that version.
   */
  def withPenguinVersion(ver: PenguinVersion): UserMutesBuilder =
    if (ver == penguinVersion) this
    else new UserMutesBuilder(ver, localeOpt)

  /**
   * Use the specified locale when tokenizing a keyword mute string.
   */
  def withLocale(locale: Locale): UserMutesBuilder =
    if (localeOpt.contains(locale)) this
    else new UserMutesBuilder(penguinVersion, Some(locale))

  /**
   * When tokenizing a user mute list, detect the language of the
   * text. This is significantly more expensive than using a predefined
   * locale, but is appropriate when the locale is not yet known.
   *
   * Any locale configured through [[withLocale]] is discarded.
   */
  def detectLocale(): UserMutesBuilder =
    if (localeOpt.isEmpty) this
    // Drop the configured locale. Passing `localeOpt` along here (as was done
    // previously) kept the locale, so detection could never be re-enabled.
    else new UserMutesBuilder(penguinVersion, None)

  private[this] lazy val tokenizer =
    localeOpt match {
      case None =>
        // No locale was specified, so use a Tokenizer that performs
        // language detection before tokenizing.
        new Tokenizer {
          override def tokenize(text: String): TokenSequence = {
            val locale = UserMutesBuilder.queryLangIdentifier.identify(text).getLocale
            Tokenizer.get(locale, penguinVersion).tokenize(text)
          }
        }

      case Some(locale) =>
        Tokenizer.get(locale, penguinVersion)
    }

  /**
   * Given a list of the user's raw keyword mutes, return a preprocessed
   * set of mutes suitable for matching against tweet text. If the input
   * contains any phrases that fail validation, then they will be
   * dropped.
   */
  def build(rawInput: Seq[String]): UserMutes =
    UserMutes(rawInput.flatMap(validate(_).right.toOption))

  /**
   * Java-friendly API for processing a user's list of raw keyword mutes
   * into a preprocessed form suitable for matching against text.
   */
  def fromJavaList(rawInput: java.util.List[String]): UserMutes =
    build(asScalaBuffer(rawInput).toSeq)

  /**
   * Validate the raw user input muted phrase. Currently, the only
   * inputs that are not valid for keyword muting are those inputs that
   * do not contain any keywords, because those inputs would match all
   * tweets.
   *
   * @return `Right` with the tokenized phrase, or `Left(UserMutes.EmptyPhrase)`
   *         when tokenization produced no tokens
   */
  def validate(mutedPhrase: String): Either[UserMutes.ValidationError, TokenSequence] = {
    val keywords = tokenizer.tokenize(mutedPhrase)
    if (keywords.isEmpty) UserMutes.EmptyPhraseError else Right(keywords)
  }
}
|
||||
|
||||
object UserMutes {

  /** The reasons for which a raw muted phrase can be rejected. */
  sealed trait ValidationError

  /**
   * Tokenizing the phrase produced no tokens at all.
   */
  case object EmptyPhrase extends ValidationError

  // Shared instance of the failed validation result, to avoid re-allocating it.
  private[matching] val EmptyPhraseError = Left(EmptyPhrase)

  /**
   * The default [[UserMutesBuilder]]: it tokenizes with the default Penguin
   * version, and chooses a locale by identifying the language of the text.
   */
  def builder(): UserMutesBuilder = UserMutesBuilder.Default
}
|
||||
|
||||
/**
 * The muted keywords of a user, with every muted phrase preprocessed into a
 * token sequence.
 *
 * @param toSeq the tokenized muted phrases, one [[TokenSequence]] per phrase
 */
case class UserMutes private[matching] (toSeq: Seq[TokenSequence]) {

  /**
   * Does at least one of the user's muted phrases occur within the supplied
   * text?
   */
  def matches(text: TokenSequence): Boolean =
    toSeq.exists(keywords => text.containsKeywordSequence(keywords))

  /**
   * Find every muted phrase that occurs within the supplied text.
   *
   * @return the indices into `toSeq` of the phrases that match, in ascending
   *         order
   */
  def find(text: TokenSequence): Seq[Int] =
    for {
      (keywords, index) <- toSeq.zipWithIndex
      if text.containsKeywordSequence(keywords)
    } yield index

  def isEmpty: Boolean = toSeq.isEmpty
  def nonEmpty: Boolean = toSeq.nonEmpty
}
|
17
tweetypie/common/src/scala/com/twitter/tweetypie/media/BUILD
Normal file
17
tweetypie/common/src/scala/com/twitter/tweetypie/media/BUILD
Normal file
|
@ -0,0 +1,17 @@
|
|||
scala_library(
|
||||
sources = ["*.scala"],
|
||||
compiler_option_sets = ["fatal_warnings"],
|
||||
platform = "java8",
|
||||
strict_deps = True,
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"mediaservices/commons/src/main/thrift:thrift-scala",
|
||||
"scrooge/scrooge-core/src/main/scala",
|
||||
"tweetypie/servo/util/src/main/scala",
|
||||
"tweetypie/common/src/thrift/com/twitter/tweetypie:media-entity-scala",
|
||||
"tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
|
||||
"tco-util",
|
||||
"tweetypie/common/src/scala/com/twitter/tweetypie/util",
|
||||
"util/util-logging/src/main/scala/com/twitter/logging",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,149 @@
|
|||
package com.twitter.tweetypie
|
||||
package media
|
||||
|
||||
import com.twitter.mediaservices.commons.thriftscala.MediaCategory
|
||||
import com.twitter.mediaservices.commons.tweetmedia.thriftscala._
|
||||
import com.twitter.tco_util.TcoSlug
|
||||
import com.twitter.tweetypie.thriftscala._
|
||||
import com.twitter.tweetypie.util.TweetLenses
|
||||
|
||||
/**
 * An assortment of helpers for working with tweet media: content type
 * conversions, URL entity copying, aspect ratios and media ownership.
 */
object Media {
  val AnimatedGifContentType = "video/mp4 codecs=avc1.42E0"

  case class MediaTco(expandedUrl: String, url: String, displayUrl: String)

  /** Content types that are considered images. */
  val ImageContentTypes: Set[MediaContentType] =
    Set[MediaContentType](
      MediaContentType.ImageJpeg,
      MediaContentType.ImagePng,
      MediaContentType.ImageGif
    )

  /** Content types that are considered animated GIFs. */
  val AnimatedGifContentTypes: Set[MediaContentType] =
    Set[MediaContentType](
      MediaContentType.VideoMp4
    )

  /** Content types that are considered videos. */
  val VideoContentTypes: Set[MediaContentType] =
    Set[MediaContentType](
      MediaContentType.VideoGeneric
    )

  /** The content types that have a string form and a file extension below. */
  val InUseContentTypes: Set[MediaContentType] =
    Set[MediaContentType](
      MediaContentType.ImageGif,
      MediaContentType.ImageJpeg,
      MediaContentType.ImagePng,
      MediaContentType.VideoMp4,
      MediaContentType.VideoGeneric
    )

  /** Is the content type one of [[ImageContentTypes]]? */
  def isImage(contentType: MediaContentType): Boolean =
    ImageContentTypes(contentType)

  /**
   * The string form of a content type.
   *
   * @throws IllegalArgumentException for a content type without a string form
   */
  def contentTypeToString(contentType: MediaContentType): String =
    contentType match {
      case MediaContentType.ImageJpeg => "image/jpeg"
      case MediaContentType.ImagePng => "image/png"
      case MediaContentType.ImageGif => "image/gif"
      case MediaContentType.VideoMp4 => "video/mp4"
      case MediaContentType.VideoGeneric => "video"
      case _ => throw new IllegalArgumentException(s"UnknownMediaContentType: $contentType")
    }

  /**
   * The inverse of [[contentTypeToString]].
   *
   * @throws IllegalArgumentException for a string that is not the string form
   *         of a content type
   */
  def stringToContentType(str: String): MediaContentType =
    str match {
      case "image/jpeg" => MediaContentType.ImageJpeg
      case "image/png" => MediaContentType.ImagePng
      case "image/gif" => MediaContentType.ImageGif
      case "video/mp4" => MediaContentType.VideoMp4
      case "video" => MediaContentType.VideoGeneric
      case _ => throw new IllegalArgumentException(s"Unknown Content Type String: $str")
    }

  /**
   * The file extension (without a dot) for a content type. Generic video has
   * an empty extension; any other content type maps to "unknown" rather than
   * failing.
   */
  def extensionForContentType(cType: MediaContentType): String =
    cType match {
      case MediaContentType.ImageGif => "gif"
      case MediaContentType.ImageJpeg => "jpg"
      case MediaContentType.ImagePng => "png"
      case MediaContentType.VideoMp4 => "mp4"
      case MediaContentType.VideoGeneric => ""
      case _ => "unknown"
    }

  /**
   * Build the URL entity that corresponds to a media entity: same indices and
   * URL, with the expanded and display URLs of the media entity.
   */
  def extractUrlEntity(mediaEntity: MediaEntity): UrlEntity =
    UrlEntity(
      fromIndex = mediaEntity.fromIndex,
      toIndex = mediaEntity.toIndex,
      url = mediaEntity.url,
      expanded = Some(mediaEntity.expandedUrl),
      display = Some(mediaEntity.displayUrl)
    )

  /**
   * Copy the indices and URLs of the URL entity into the media entity.
   *
   * The expanded URL is taken from the URL entity when it has one, else the
   * media entity's own expanded URL is kept (falling back to the short URL
   * should that be null). The display URL is the pic.twitter.com alias when
   * the short URL is a t.co URL, else the URL entity's expanded URL (or the
   * short URL if there is none).
   */
  def copyFromUrlEntity(mediaEntity: MediaEntity, urlEntity: UrlEntity): MediaEntity = {
    val shortUrl = urlEntity.url

    val newExpandedUrl =
      urlEntity.expanded.orElse(Option(mediaEntity.expandedUrl)).getOrElse(shortUrl)

    val newDisplayUrl =
      shortUrl match {
        case TcoSlug(slug) => MediaUrl.Display.fromTcoSlug(slug)
        case _ => urlEntity.expanded.getOrElse(shortUrl)
      }

    mediaEntity.copy(
      fromIndex = urlEntity.fromIndex,
      toIndex = urlEntity.toIndex,
      url = shortUrl,
      expandedUrl = newExpandedUrl,
      displayUrl = newDisplayUrl
    )
  }

  /** The aspect ratio of a media size; see the (width, height) overload. */
  def getAspectRatio(size: MediaSize): AspectRatio =
    getAspectRatio(size.width, size.height)

  /**
   * The aspect ratio of the given dimensions, reduced by their greatest
   * common divisor.
   *
   * NOTE(review): negative dimensions are not rejected, and the reduced
   * terms are narrowed with `toShort` without a range check.
   *
   * @throws IllegalArgumentException if either dimension is zero
   */
  def getAspectRatio(width: Int, height: Int): AspectRatio = {
    if (width == 0 || height == 0) {
      throw new IllegalArgumentException(s"Dimensions must be non zero: ($width, $height)")
    }

    // Euclid's algorithm.
    @annotation.tailrec
    def gcdOf(larger: Int, smaller: Int): Int =
      if (smaller == 0) larger else gcdOf(smaller, larger % smaller)

    val divisor = gcdOf(math.max(width, height), math.min(width, height))
    val reducedWidth = width / divisor
    val reducedHeight = height / divisor
    AspectRatio(reducedWidth.toShort, reducedHeight.toShort)
  }

  /**
   * The media entities of the tweet that belong to the tweet itself.
   */
  def ownMedia(tweet: Tweet): Seq[MediaEntity] =
    TweetLenses.media.get(tweet).filter(entity => isOwnMedia(tweet.id, entity))

  /**
   * Does the given media entity, which was found on the tweet with the
   * specified tweetId, belong to that tweet? An entity without a source
   * status id is considered to belong to the tweet.
   */
  def isOwnMedia(tweetId: TweetId, entity: MediaEntity): Boolean =
    entity.sourceStatusId.forall(_ == tweetId)

  /**
   * Mixed Media is any case where there is more than one media item & any of
   * them is not an image, judged either by its media info or by the category
   * of its media key.
   */
  def isMixedMedia(mediaEntities: Seq[MediaEntity]): Boolean = {
    def hasNonImageInfo: Boolean =
      mediaEntities.flatMap(_.mediaInfo).exists {
        case _: MediaInfo.ImageInfo => false
        case _ => true
      }

    def hasNonImageCategory: Boolean =
      mediaEntities
        .flatMap(_.mediaKey)
        .map(_.mediaCategory)
        .exists(_ != MediaCategory.TweetImage)

    mediaEntities.length > 1 && (hasNonImageInfo || hasNonImageCategory)
  }
}
|
|
@ -0,0 +1,108 @@
|
|||
package com.twitter.tweetypie
|
||||
package media
|
||||
|
||||
import com.twitter.logging.Logger
|
||||
import com.twitter.tweetypie.thriftscala.MediaEntity
|
||||
import com.twitter.tweetypie.thriftscala.UrlEntity
|
||||
|
||||
/**
 * Creating and parsing tweet media entity URLs.
 *
 * There are four kinds of URL in a media entity:
 *
 *   - Display URLs: pic.twitter.com aliases for the short URL, for
 *     embedding in the tweet text.
 *
 *   - Short URLs: regular t.co URLs that expand to the permalink URL.
 *
 *   - Permalink URLs: link to a page that displays the media after
 *     doing authorization
 *
 *   - Asset URLs: links to the actual media asset.
 *
 */
object MediaUrl {
  private[this] val log = Logger(getClass)

  /**
   * The URL that should be filled in to the displayUrl field of the
   * media entity. This URL behaves exactly the same as a t.co link
   * (only the domain is different.)
   */
  object Display {
    val Root = "pic.twitter.com/"

    def fromTcoSlug(tcoSlug: String): String = Root + tcoSlug
  }

  /**
   * The link target for the link in the tweet text (the expanded URL
   * for the media, copied from the URL entity.) For native photos,
   * this is the tweet permalink page.
   *
   * For users without a screen name ("handleless" or NoScreenName users)
   * a permalink to /i/status/:tweet_id is used.
   */
  object Permalink {
    val Root = "https://twitter.com/"
    val Internal = "i"
    val PhotoSuffix = "/photo/1"
    val VideoSuffix = "/video/1"

    /**
     * Build the permalink for the media of a tweet.
     *
     * @param screenName screen name of the tweet's author; when empty the
     *                   `Internal` path segment is used in its place
     * @param tweetId    id of the tweet
     * @param isVideo    selects the video suffix rather than the photo suffix
     */
    def apply(screenName: String, tweetId: TweetId, isVideo: Boolean): String =
      Root +
        (if (screenName.isEmpty) Internal else screenName) +
        "/status/" +
        tweetId +
        (if (isVideo) VideoSuffix else PhotoSuffix)

    // The dot in the host name is escaped so that it only matches a literal
    // '.'; unescaped it matched any character (e.g. "twitterXcom").
    private[this] val PermalinkRegex =
      """https?://twitter\.com/(?:#!/)?\w+/status/(\d+)/(?:photo|video)/\d+""".r

    // Extract the tweet id from a string that is, in its entirety, a permalink.
    private[this] def getTweetId(permalink: String): Option[TweetId] =
      permalink match {
        case PermalinkRegex(tweetIdStr) =>
          try {
            Some(tweetIdStr.toLong)
          } catch {
            // Digits too big to fit in a Long
            case _: NumberFormatException => None
          }
        case _ => None
      }

    /** The tweet id of the permalink in the entity's expanded URL, if any. */
    def getTweetId(urlEntity: UrlEntity): Option[TweetId] =
      urlEntity.expanded.flatMap(getTweetId)

    /** Is `permalink` a media permalink for the given tweet? */
    def hasTweetId(permalink: String, tweetId: TweetId): Boolean =
      getTweetId(permalink).contains(tweetId)

    /** Is the media entity's expanded URL a permalink for the given tweet? */
    def hasTweetId(mediaEntity: MediaEntity, tweetId: TweetId): Boolean =
      hasTweetId(mediaEntity.expandedUrl, tweetId)

    /** Is the URL entity's expanded URL a permalink for the given tweet? */
    def hasTweetId(urlEntity: UrlEntity, tweetId: TweetId): Boolean =
      getTweetId(urlEntity).contains(tweetId)
  }

  /**
   * Converts a url that starts with "https://" to one that starts with
   * "http://". Any other url is returned unchanged.
   */
  def httpsToHttp(url: String): String = {
    val httpsScheme = "https://"
    // Only the leading scheme is rewritten; an "https://" embedded further
    // along (e.g. inside a query parameter) must be left intact.
    if (url.startsWith(httpsScheme)) "http://" + url.substring(httpsScheme.length)
    else url
  }

  /**
   * Gets the last path element from an asset url. This exists temporarily to support
   * the now deprecated mediaPath element in MediaEntity.
   *
   * @return everything after the last '/', or the empty string (after logging
   *         an error) when the url has no '/'
   */
  def mediaPathFromUrl(url: String): String =
    url.lastIndexOf('/') match {
      case -1 =>
        log.error("Invalid media path. Could not find last element: " + url)
        // Better to return a broken preview URL to the client
        // than to fail the whole request.
        ""

      case idx =>
        url.substring(idx + 1)
    }
}
|
|
@ -0,0 +1,7 @@
|
|||
package com.twitter.tweetypie
|
||||
|
||||
/**
 * Type aliases shared by the media helpers. All ids are plain `Long`s; the
 * aliases only document which kind of id a value is meant to be.
 */
package object media {

  /** The id of a tweet. */
  type TweetId = Long

  /** The id of a user. */
  type UserId = Long

  /** The id of a media item. */
  type MediaId = Long
}
|
|
@ -0,0 +1,80 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.stitch.Stitch
|
||||
import com.twitter.storage.client.manhattan.kv.ManhattanValue
|
||||
import com.twitter.tweetypie.storage.TweetUtils.collectWithRateLimitCheck
|
||||
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
|
||||
import com.twitter.tweetypie.thriftscala.Tweet
|
||||
import com.twitter.util.Time
|
||||
|
||||
/**
 * Writes a new tweet to Manhattan: one record for the packed core fields,
 * plus one record for each of the other stored fields.
 */
object AddTweetHandler {
  private[storage] type InternalAddTweet = (
    Tweet,
    ManhattanOperations.Insert,
    Scribe,
    StatsReceiver,
    Time
  ) => Stitch[Unit]

  /**
   * Create the `AddTweet` operation. Every tweet is written with the time at
   * which the operation is invoked as its timestamp.
   */
  def apply(
    insert: ManhattanOperations.Insert,
    scribe: Scribe,
    stats: StatsReceiver
  ): TweetStorageClient.AddTweet =
    tweet => doAddTweet(tweet, insert, scribe, stats, Time.now)

  /**
   * Break a stored tweet up into the Manhattan records that represent it.
   * All of the records carry the same timestamp.
   *
   * @return the records of the non-core internal fields and of the additional
   *         fields, followed by the core fields record
   */
  def makeRecords(
    storedTweet: StoredTweet,
    timestamp: Time
  ): Seq[TweetManhattanRecord] = {
    val tweetId = storedTweet.id

    // Every field other than the core fields is stored under a key of its own.
    val nonCoreFieldIds =
      TweetFields.nonCoreInternalFields ++ TweetFields.getAdditionalFieldIds(storedTweet)

    val nonCoreRecords =
      storedTweet
        .getFieldBlobs(nonCoreFieldIds)
        .toSeq
        .map {
          case (fieldId, tFieldBlob) =>
            TweetManhattanRecord(
              TweetKey.fieldKey(tweetId, fieldId),
              ManhattanValue(TFieldBlobCodec.toByteBuffer(tFieldBlob), Some(timestamp))
            )
        }

    // The core fields are packed together into a single blob.
    val coreFieldsBlob = CoreFieldsCodec.toTFieldBlob(CoreFieldsCodec.fromTweet(storedTweet))
    val coreRecord =
      TweetManhattanRecord(
        TweetKey.coreFieldsKey(tweetId),
        ManhattanValue(TFieldBlobCodec.toByteBuffer(coreFieldsBlob), Some(timestamp))
      )

    nonCoreRecords :+ coreRecord
  }

  /**
   * Insert all of the records of the tweet. The result is successful only
   * once the outcomes of all of the inserts have been examined by
   * `collectWithRateLimitCheck`; on success the stored tweet is handed to the
   * scribe. The per-field counters for "addTweet" are updated when the
   * operation is invoked, regardless of the outcome of the inserts.
   *
   * Asserts that the tweet has core data.
   */
  private[storage] val doAddTweet: InternalAddTweet =
    (tweet, insert, scribe, stats, timestamp) => {
      assert(tweet.coreData.isDefined, s"Tweet ${tweet.id} is missing coreData: $tweet")

      val storedTweet = StorageConversions.toStoredTweet(tweet)
      val pendingInserts = makeRecords(storedTweet, timestamp).map(insert)

      // Lift each insert to a Try so that one failure does not hide the
      // outcomes of the others from the rate limit check.
      val checkedInserts =
        Stitch
          .collect(pendingInserts.map(_.liftToTry))
          .map(collectWithRateLimitCheck)
          .lowerFromTry

      Stats.updatePerFieldQpsCounters(
        "addTweet",
        TweetFields.getAdditionalFieldIds(storedTweet),
        1,
        stats
      )

      checkedInserts.unit.onSuccess { _ => scribe.logAdded(storedTweet) }
    }
}
|
|
@ -0,0 +1,47 @@
|
|||
# Build target compiling every Scala source in this directory.
scala_library(
    sources = ["*.scala"],
    compiler_option_sets = ["fatal_warnings"],
    platform = "java8",
    strict_deps = True,
    tags = [
        "bazel-compatible",
        "bazel-incompatible-scaladoc",
    ],
    dependencies = [
        "3rdparty/jvm/com/chuusai:shapeless",
        "3rdparty/jvm/com/fasterxml/jackson/core:jackson-databind",
        "3rdparty/jvm/com/fasterxml/jackson/module:jackson-module-scala",
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/twitter/bijection:core",
        "3rdparty/jvm/com/twitter/bijection:scrooge",
        "3rdparty/jvm/com/twitter/bijection:thrift",
        "3rdparty/jvm/commons-codec",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "diffshow",
        "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authorization",
        "finagle/finagle-core/src/main",
        "finagle/finagle-stats",
        "finagle/finagle-thriftmux/src/main/scala",
        "mediaservices/commons/src/main/thrift:thrift-scala",
        "scrooge/scrooge-serializer/src/main/scala",
        "tweetypie/servo/repo/src/main/scala",
        "tweetypie/servo/util",
        "snowflake:id",
        "src/thrift/com/twitter/escherbird:media-annotation-structs-scala",
        "src/thrift/com/twitter/manhattan:internal-scala",
        "tweetypie/common/src/thrift/com/twitter/tweetypie:media-entity-scala",
        "tweetypie/common/src/thrift/com/twitter/tweetypie:service-scala",
        "tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
        "stitch/stitch-core",
        "storage/clients/manhattan/client/src/main/scala",
        "tbird-thrift:scala",
        "tweetypie/common/src/scala/com/twitter/tweetypie/additionalfields",
        "tweetypie/common/src/scala/com/twitter/tweetypie/client_id",
        "tweetypie/common/src/scala/com/twitter/tweetypie/util",
        "tweetypie/common/src/thrift/com/twitter/tweetypie/storage_internal:storage_internal-scala",
        "util-internal/scribe",
        "util/util-core:scala",
        "util/util-slf4j-api/src/main/scala/com/twitter/util/logging",
        "util/util-stats/src/main/scala",
    ],
)
|
|
@ -0,0 +1,20 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.util.Time
|
||||
|
||||
/**
 * Builds the storage client's `BounceDelete` operation: it inserts a `BounceDeleted` state
 * record stamped with the current time and, once the insert succeeds, scribes the removal.
 */
object BounceDeleteHandler {
  def apply(
    insert: ManhattanOperations.Insert,
    scribe: Scribe
  ): TweetStorageClient.BounceDelete =
    tweetId => {
      // The same instant is used for the state record and for the scribe entry.
      val deletedAt = Time.now
      val stateRecord = TweetStateRecord.BounceDeleted(tweetId, deletedAt.inMillis)

      insert(stateRecord.toTweetMhRecord)
        .onSuccess(_ => scribe.logRemoved(tweetId, deletedAt, isSoftDeleted = true))
    }
}
|
|
@ -0,0 +1,242 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.bijection.Conversion.asMethod
|
||||
import com.twitter.bijection.Injection
|
||||
import com.twitter.scrooge.TFieldBlob
|
||||
import com.twitter.storage.client.manhattan.kv._
|
||||
import com.twitter.tweetypie.storage.Response.FieldResponse
|
||||
import com.twitter.tweetypie.storage.Response.FieldResponseCode
|
||||
import com.twitter.tweetypie.storage_internal.thriftscala.CoreFields
|
||||
import com.twitter.tweetypie.storage_internal.thriftscala.InternalTweet
|
||||
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
|
||||
import java.io.ByteArrayOutputStream
|
||||
import java.nio.ByteBuffer
|
||||
import org.apache.thrift.protocol.TBinaryProtocol
|
||||
import org.apache.thrift.transport.TIOStreamTransport
|
||||
import org.apache.thrift.transport.TMemoryInputTransport
|
||||
import scala.collection.immutable
|
||||
import scala.util.control.NoStackTrace
|
||||
|
||||
// NOTE: All field ids and Tweet structure in this file correspond to the StoredTweet struct ONLY
|
||||
|
||||
/** Converts between byte arrays and `ByteBuffer`s through the bijection `as` conversion. */
object ByteArrayCodec {
  def toByteBuffer(byteArray: Array[Byte]): ByteBuffer =
    byteArray.as[ByteBuffer]

  def fromByteBuffer(buffer: ByteBuffer): Array[Byte] =
    buffer.as[Array[Byte]]
}
|
||||
|
||||
/** Converts between `String`s and `ByteBuffer`s by way of a byte array. */
object StringCodec {
  // Injection composed as String -> Array[Byte] -> ByteBuffer.
  private val codec = Injection.connect[String, Array[Byte], ByteBuffer]

  def toByteBuffer(strValue: String): ByteBuffer = codec(strValue)

  /**
   * Decodes `buffer` back into a string. The result of `invert` is unwrapped with `.get`, so a
   * buffer that cannot be inverted surfaces as an exception rather than a value.
   */
  def fromByteBuffer(buffer: ByteBuffer): String = {
    val decoded = codec.invert(buffer)
    decoded.get
  }
}
|
||||
|
||||
/**
|
||||
* Terminology
|
||||
* -----------
|
||||
* Tweet id field : The field number of 'tweetId' in the 'Tweet' thrift structure (i.e "1")
|
||||
*
|
||||
* First AdditionalField id : The ID of the first additional field in 'Tweet' thrift structure. All field Ids less than this are
|
||||
* considered internal and all the ids greater than or equal to this field id are considered 'Additional fields'.
|
||||
* This is set to 100.
|
||||
*
|
||||
* Internal Fields : Fields with ids [1 to firstAdditionalFieldid) (excluding firstAdditionalFieldId)
|
||||
*
|
||||
* Core fields : (Subset of Internal fields)- Fields with ids [1 to 8, 19]. These fields are "packed" together and stored
|
||||
* under a single key. This key is referred to as "CoreFieldsKey" (see @TweetKeyType.CoreFieldsKey).
|
||||
* Note: Actually field 1 is skipped when packing as this field is the tweet id and it need not be
|
||||
* explicitly stored since the pkey already contains the tweet Id)
|
||||
*
|
||||
* Root Core field id : The field id under which the packed core fields are stored in Manhattan. (This is field Id "1")
|
||||
*
|
||||
* Required fields : (Subset of Core fields) - Fields with ids [1 to 5] that MUST be present on every tweet.
|
||||
*
|
||||
* Additional Fields : All fields with field ids >= 'firstAdditionalFieldId'
|
||||
*
|
||||
* Compiled Additional fields : (Subset of Additional Fields) - All fields that the storage library knows about
|
||||
* (i.e present on the latest storage_internal.thrift that is compiled-in).
|
||||
*
|
||||
* Passthrough fields : (Subset of Additional Fields) - The fields on storage_internal.thrift that the storage library is NOT aware of
|
||||
* These field ids are obtained by looking at the "_passThroughFields" member of the scrooge-generated
|
||||
* 'Tweet' object.
|
||||
*
|
||||
* coreFieldsIdInInternalTweet: This is the field id of the core fields (the only field) in the Internal Tweet struct
|
||||
*/
|
||||
/**
 * Field-id constants and groupings for the StoredTweet struct. Ids below
 * `firstAdditionalFieldId` are internal fields; ids at or above it are additional fields.
 */
object TweetFields {
  val firstAdditionalFieldId: Short = 100
  val tweetIdField: Short = 1
  val geoFieldId: Short = 9

  // The field under which all the core field values are stored (in serialized form).
  val rootCoreFieldId: Short = 1

  /** Ids 1 through 8, followed by the quoted-tweet field (19). */
  val coreFieldIds: immutable.IndexedSeq[FieldId] = {
    val quotedTweetFieldId: Short = 19
    (1 to 8).map(_.toShort) :+ quotedTweetFieldId
  }

  /** Ids 1 through 5. */
  val requiredFieldIds: immutable.IndexedSeq[FieldId] = (1 to 5).map(_.toShort)

  val coreFieldsIdInInternalTweet: Short = 1

  /** Ids of the additional fields declared on the compiled-in StoredTweet struct. */
  val compiledAdditionalFieldIds: Seq[FieldId] =
    StoredTweet.metaData.fields.collect {
      case field if field.id >= firstAdditionalFieldId => field.id
    }

  /** Ids of the internal fields declared on the compiled-in StoredTweet struct. */
  val internalFieldIds: Seq[FieldId] =
    StoredTweet.metaData.fields.collect {
      case field if field.id < firstAdditionalFieldId => field.id
    }

  /** Internal field ids that are not part of the packed core fields. */
  val nonCoreInternalFields: Seq[FieldId] = {
    val internal = internalFieldIds.toSet
    val core = coreFieldIds.toSet
    (internal -- core).toSeq
  }

  /** Compiled additional field ids followed by the ids of the tweet's passthrough fields. */
  def getAdditionalFieldIds(tweet: StoredTweet): Seq[FieldId] =
    compiledAdditionalFieldIds ++ tweet._passthroughFields.keys
}
|
||||
|
||||
/**
 * Converts a TFieldBlob to and from the ByteBuffer that gets stored in Manhattan.
 *
 * Layout, written with the thrift binary protocol:
 *  [format version as a double][field header of the blob][blob data as binary]
 */
object TFieldBlobCodec {
  val BinaryProtocolFactory: TBinaryProtocol.Factory = new TBinaryProtocol.Factory()
  val FormatVersion = 1.0

  /** Serializes the version, the blob's field header and the blob's data, in that order. */
  def toByteBuffer(tFieldBlob: TFieldBlob): ByteBuffer = {
    val out = new ByteArrayOutputStream()
    val writer = BinaryProtocolFactory.getProtocol(new TIOStreamTransport(out))

    writer.writeDouble(FormatVersion)
    writer.writeFieldBegin(tFieldBlob.field)
    writer.writeBinary(ByteArrayCodec.toByteBuffer(tFieldBlob.data))

    ByteArrayCodec.toByteBuffer(out.toByteArray)
  }

  /**
   * Inverse of [[toByteBuffer]].
   *
   * Throws VersionMismatchError when the leading version differs from `FormatVersion`.
   */
  def fromByteBuffer(buffer: ByteBuffer): TFieldBlob = {
    val reader = BinaryProtocolFactory.getProtocol(
      new TMemoryInputTransport(ByteArrayCodec.fromByteBuffer(buffer))
    )

    val version = reader.readDouble()
    if (version != FormatVersion) {
      throw new VersionMismatchError(
        s"Version mismatch in decoding ByteBuffer to TFieldBlob. Actual version: $version. Expected version: $FormatVersion"
      )
    }

    // The field header must be consumed before the binary payload that follows it.
    val field = reader.readFieldBegin()
    val payload = ByteArrayCodec.fromByteBuffer(reader.readBinary())

    TFieldBlob(field, payload)
  }
}
|
||||
|
||||
/**
|
||||
* Helper object to help convert 'CoreFields' object to/from TFieldBlob (and also to construct
|
||||
* 'CoreFields' object from a 'StoredTweet' object)
|
||||
*
|
||||
* More details:
|
||||
* - A subset of fields on the 'StoredTweet' thrift structure (2-8,19) are 'packaged' and stored
|
||||
* together as a serialized TFieldBlob object under a single key in Manhattan (see TweetKeyCodec
|
||||
* helper object above for more details).
|
||||
*
|
||||
* - To make the packing/unpacking the fields to/from TFieldBlob object, we created the following
|
||||
* two helper thrift structures 'CoreFields' and 'InternalTweet'
|
||||
*
|
||||
* // The field Ids and types here MUST exactly match field Ids on 'StoredTweet' thrift structure.
|
||||
* struct CoreFields {
|
||||
* 2: optional i64 user_id
|
||||
* ...
|
||||
* 8: optional i64 contributor_id
|
||||
* ...
|
||||
* 19: optional StoredQuotedTweet stored_quoted_tweet
|
||||
*
|
||||
* }
|
||||
*
|
||||
* // The field id of core fields MUST be "1"
|
||||
* struct InternalTweet {
|
||||
* 1: CoreFields coreFields
|
||||
* }
|
||||
*
|
||||
* - Given the above two structures, packing/unpacking fields (2-8,19) on StoredTweet object into a TFieldBlob
|
||||
* becomes very trivial:
|
||||
* For packing:
|
||||
* (i) Copy fields (2-8,19) from StoredTweet object to a new CoreFields object
|
||||
* (ii) Create a new InternalTweet object with the 'CoreFields' object constructed in step (i) above
|
||||
* (iii) Extract field "1" as a TFieldBlob from InternalTweet (by calling the scrooge generated "getFieldBlob(1)"
|
||||
* function on the InternalTweet object)
|
||||
*
|
||||
* For unpacking:
|
||||
* (i) Create an empty 'InternalTweet' object
|
||||
* (ii) Call scrooge-generated 'setField' by passing the tFieldBlob blob (created by packing steps above)
|
||||
* (iii) Doing step (ii) above will create a hydrated 'CoreFields' object that can be accessed by 'coreFields'
|
||||
* member of 'InternalTweet' object.
|
||||
*/
|
||||
object CoreFieldsCodec {

  /** Ids of every field declared on the CoreFields struct. */
  val coreFieldIds: Seq[FieldId] = CoreFields.metaData.fields.map(_.id)

  /**
   * "Packs" the core fields: wraps them in an InternalTweet and extracts the wrapper's single
   * field as a TFieldBlob.
   */
  def toTFieldBlob(coreFields: CoreFields): TFieldBlob = {
    val wrapper = InternalTweet(Some(coreFields))
    wrapper.getFieldBlob(TweetFields.coreFieldsIdInInternalTweet).get
  }

  /**
   * "Unpacks" a packed TFieldBlob: sets it on an empty InternalTweet and reads the resulting
   * coreFields member. Fails on `.get` if that member is not populated by the blob.
   */
  def fromTFieldBlob(tFieldBlob: TFieldBlob): CoreFields = {
    val wrapper = InternalTweet().setField(tFieldBlob)
    wrapper.coreFields.get
  }

  /** Unpacks a packed TFieldBlob into a map of core field id -> TFieldBlob. */
  def unpackFields(tFieldBlob: TFieldBlob): Map[Short, TFieldBlob] = {
    val core = fromTFieldBlob(tFieldBlob)
    core.getFieldBlobs(coreFieldIds)
  }

  /**
   * Creates a CoreFields object from a StoredTweet.
   *
   * The field ids and types on CoreFields are expected to match the corresponding fields on
   * StoredTweet, so each blob read from the tweet is passed straight to `setField` on the
   * CoreFields being built. Fields absent from the tweet are skipped.
   */
  def fromTweet(tweet: StoredTweet): CoreFields =
    coreFieldIds.foldLeft(CoreFields()) { (core, fieldId) =>
      tweet.getFieldBlob(fieldId).fold(core)(blob => core.setField(blob))
    }
}
|
||||
|
||||
/**
 * Helper object to convert ManhattanException to FieldResponseCode thrift object
 */
object FieldResponseCodeCodec {
  import FieldResponseCodec.ValueNotFoundException

  /**
   * Maps a Manhattan failure to the response code reported for a field.
   *
   * @param mhException the failure raised by the Manhattan client (or the storage library's own
   *                    ValueNotFoundException)
   * @return ValueNotFound, InvalidRequest or Timeout for the matching exception types; Error for
   *         everything else
   */
  def fromManhattanException(mhException: ManhattanException): FieldResponseCode = {
    mhException match {
      case _: ValueNotFoundException => FieldResponseCode.ValueNotFound
      case _: InternalErrorManhattanException => FieldResponseCode.Error
      case _: InvalidRequestManhattanException => FieldResponseCode.InvalidRequest
      case _: DeniedManhattanException => FieldResponseCode.Error
      case _: UnsatisfiableManhattanException => FieldResponseCode.Error
      case _: TimeoutManhattanException => FieldResponseCode.Timeout
      // ManhattanException is open for extension (ValueNotFoundException extends it from this
      // package), so unlisted subclasses can reach this point. Report them as a generic error
      // instead of failing the conversion with a MatchError.
      case _ => FieldResponseCode.Error
    }
  }
}
|
||||
|
||||
/**
 * Helper object to construct FieldResponse thrift object from an Exception.
 * This is typically called to convert 'ManhattanException' object to 'FieldResponse' thrift object
 */
object FieldResponseCodec {

  /** Stack-trace-free ManhattanException used to signal that a value was not found. */
  class ValueNotFoundException extends ManhattanException("Value not found!") with NoStackTrace

  private[storage] val NotFound = new ValueNotFoundException

  /**
   * Builds a FieldResponse describing a failure.
   *
   * @param e             the failure; a ManhattanException is mapped through
   *                      [[FieldResponseCodeCodec.fromManhattanException]], anything else is
   *                      reported as Error
   * @param additionalMsg optional context that is prepended to the failure's message,
   *                      separated by ". "
   * @return a response whose message is always defined
   */
  def fromThrowable(e: Throwable, additionalMsg: Option[String] = None): FieldResponse = {
    val respCode = e match {
      case mhException: ManhattanException =>
        FieldResponseCodeCodec.fromManhattanException(mhException)
      case _ => FieldResponseCode.Error
    }

    // Throwable.getMessage may be null; fall back to the exception's class name so that building
    // the response can neither throw a NullPointerException nor embed the text "null".
    val errMsg = Option(e.getMessage).getOrElse(e.getClass.getName)
    val respMsg = additionalMsg.fold(errMsg)(_ + ". " + errMsg)

    FieldResponse(respCode, Some(respMsg))
  }
}
|
|
@ -0,0 +1,67 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.stitch.Stitch
|
||||
import com.twitter.storage.client.manhattan.kv.DeniedManhattanException
|
||||
import com.twitter.tweetypie.storage.TweetUtils._
|
||||
import com.twitter.util.Throw
|
||||
import com.twitter.util.Time
|
||||
|
||||
/**
 * Builds the storage client's `DeleteAdditionalFields` operation, which deletes the given
 * additional fields from each of the given tweets and reports one response per tweet.
 */
object DeleteAdditionalFieldsHandler {
  def apply(
    delete: ManhattanOperations.Delete,
    stats: StatsReceiver
  ): TweetStorageClient.DeleteAdditionalFields =
    (unfilteredTweetIds: Seq[TweetId], additionalFields: Seq[Field]) => {
      // Non-positive tweet ids are dropped up front.
      val tweetIds = unfilteredTweetIds.filter(id => id > 0)
      val additionalFieldIds = additionalFields.map(_.id)

      require(additionalFields.nonEmpty, "Additional fields to delete cannot be empty")
      require(
        additionalFieldIds.forall(_ >= TweetFields.firstAdditionalFieldId),
        s"Additional fields $additionalFields must be in additional field range (>= ${TweetFields.firstAdditionalFieldId})"
      )

      Stats.addWidthStat("deleteAdditionalFields", "tweetIds", tweetIds.size, stats)
      Stats.addWidthStat(
        "deleteAdditionalFields",
        "additionalFieldIds",
        additionalFieldIds.size,
        stats
      )
      Stats.updatePerFieldQpsCounters(
        "deleteAdditionalFields",
        additionalFieldIds,
        tweetIds.size,
        stats
      )

      // One timestamp is shared by every delete issued for this request.
      val mhTimestamp = Time.now

      val perTweet = tweetIds.map { tweetId =>
        // Each delete is lifted to a Try so that one failing field does not hide the others.
        val deletions = additionalFieldIds.map { fieldId =>
          delete(TweetKey.additionalFieldsKey(tweetId, fieldId), Some(mhTimestamp)).liftToTry
        }

        Stitch.collect(deletions).map { outcomes =>
          val resultsPerTweet = additionalFieldIds.zip(outcomes).toMap

          // A denied delete on any field marks the whole tweet as over capacity.
          val wasRateLimited = outcomes.exists {
            case Throw(_: DeniedManhattanException) => true
            case _ => false
          }

          if (wasRateLimited)
            buildTweetOverCapacityResponse("deleteAdditionalFields", tweetId, resultsPerTweet)
          else
            buildTweetResponse("deleteAdditionalFields", tweetId, resultsPerTweet)
        }
      }

      Stitch.collect(perTweet)
    }
}
|
|
@ -0,0 +1,41 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.tweetypie.additionalfields.AdditionalFields
|
||||
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
|
||||
import com.twitter.tweetypie.thriftscala.{Tweet => TpTweet}
|
||||
|
||||
/**
 * A field of the stored version of a tweet to read, update, or delete.
 *
 * Field ids of [[com.twitter.tweetypie.thriftscala.Tweet]] and of
 * [[com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet]] do not correspond
 * one-to-one. For example, in StoredTweet the nsfwUser property is field 11, whereas in Tweet it
 * is a property of the coreData struct in field 2. Callers therefore reference the part of the
 * object to modify through an instance of [[Field]] rather than through a raw id from either
 * struct.
 */
class Field private[storage] (val id: Short) extends AnyVal {
  override def toString: String = s"$id"
}

/**
 * NOTE: Make sure `AllUpdatableCompiledFields` is kept up to date when adding any new field
 */
object Field {
  import AdditionalFields.isAdditionalFieldId

  // Fields identified by their id on StoredTweet.
  val Geo: Field = new Field(StoredTweet.GeoField.id)
  val HasTakedown: Field = new Field(StoredTweet.HasTakedownField.id)
  val NsfwUser: Field = new Field(StoredTweet.NsfwUserField.id)
  val NsfwAdmin: Field = new Field(StoredTweet.NsfwAdminField.id)

  // Fields identified by their id on the Tweet thrift struct.
  val TweetypieOnlyTakedownCountryCodes: Field =
    new Field(TpTweet.TweetypieOnlyTakedownCountryCodesField.id)
  val TweetypieOnlyTakedownReasons: Field =
    new Field(TpTweet.TweetypieOnlyTakedownReasonsField.id)

  val AllUpdatableCompiledFields: Set[Field] =
    Set(Geo, HasTakedown, NsfwUser, NsfwAdmin)

  /** Creates a [[Field]] for `id`, which must satisfy `isAdditionalFieldId`. */
  def additionalField(id: Short): Field = {
    require(isAdditionalFieldId(id), "field id must be in the additional field range")
    new Field(id)
  }
}
|
|
@ -0,0 +1,150 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.stitch.Stitch
|
||||
import com.twitter.storage.client.manhattan.kv.DeniedManhattanException
|
||||
import com.twitter.tweetypie.storage.Response.TweetResponseCode
|
||||
import com.twitter.tweetypie.storage.TweetUtils._
|
||||
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
|
||||
import com.twitter.tweetypie.thriftscala.DeletedTweet
|
||||
import scala.util.control.NonFatal
|
||||
|
||||
/** Deletion status reported for a tweet by [[GetDeletedTweetsHandler]]. */
sealed trait DeleteState

object DeleteState {

  /**
   * The tweet is deleted, but its data has not been permanently removed from Manhattan.
   * A tweet in this state may be undeleted.
   */
  case object SoftDeleted extends DeleteState

  /**
   * The tweet was deleted after being bounced for violating the Twitter Rules, but its data has
   * not been permanently removed from Manhattan. A tweet in this state may NOT be undeleted.
   */
  case object BounceDeleted extends DeleteState

  /**
   * The tweet has been permanently deleted from Manhattan.
   */
  case object HardDeleted extends DeleteState

  /**
   * Manhattan holds no data that distinguishes this tweet id from one that never existed.
   */
  case object NotFound extends DeleteState

  /**
   * The tweet exists and is not in a deleted state.
   */
  case object NotDeleted extends DeleteState
}
|
||||
|
||||
/**
 * Result of looking up a single tweet id with [[GetDeletedTweetsHandler]].
 *
 * @param tweetId         the id that was looked up
 * @param overallResponse whether the lookup itself succeeded
 * @param deleteState     deletion status derived from the records that were read
 * @param tweet           the tweet's data, when the handler produced any
 */
case class DeletedTweetResponse(
  tweetId: TweetId,
  overallResponse: TweetResponseCode,
  deleteState: DeleteState,
  tweet: Option[DeletedTweet]
)
|
||||
|
||||
/**
 * Builds the storage client's `GetDeletedTweets` operation, which reports the deletion state
 * of each requested tweet together with whatever tweet data is still stored.
 */
object GetDeletedTweetsHandler {
  def apply(
    read: ManhattanOperations.Read,
    stats: StatsReceiver
  ): TweetStorageClient.GetDeletedTweets =
    (unfilteredTweetIds: Seq[TweetId]) => {
      // Non-positive tweet ids are dropped up front.
      val tweetIds = unfilteredTweetIds.filter(id => id > 0)

      Stats.addWidthStat("getDeletedTweets", "tweetIds", tweetIds.size, stats)

      Stitch.collect(tweetIds.map(tweetId => lookup(read, tweetId)))
    }

  /**
   * Reads one tweet and classifies it by its most recent state record. Read failures are turned
   * into responses (OverCapacity for a denied read, Failure otherwise) instead of failing the
   * whole batch.
   */
  private def lookup(
    read: ManhattanOperations.Read,
    tweetId: TweetId
  ): Stitch[DeletedTweetResponse] =
    read(tweetId)
      .map { mhRecords =>
        val storedTweet = buildStoredTweet(tweetId, mhRecords)

        TweetStateRecord.mostRecent(mhRecords) match {
          case Some(soft: TweetStateRecord.SoftDeleted) => softDeleted(soft, storedTweet)
          case Some(bounced: TweetStateRecord.BounceDeleted) => bounceDeleted(bounced, storedTweet)
          case Some(hard: TweetStateRecord.HardDeleted) => hardDeleted(hard, storedTweet)
          // No deletion record: the tweet counts as found only if an expected field is stored.
          case _ if storedTweet.getFieldBlobs(expectedFields).isEmpty => notFound(tweetId)
          case _ => notDeleted(tweetId, storedTweet)
        }
      }
      .handle {
        case _: DeniedManhattanException =>
          unavailable(tweetId, TweetResponseCode.OverCapacity)

        case NonFatal(ex) =>
          TweetUtils.log.warning(
            ex,
            s"Unhandled exception in GetDeletedTweetsHandler for tweetId: $tweetId"
          )
          unavailable(tweetId, TweetResponseCode.Failure)
      }

  /** Response for a lookup that could not be completed. */
  private def unavailable(tweetId: TweetId, code: TweetResponseCode) =
    DeletedTweetResponse(tweetId, code, DeleteState.NotFound, None)

  /** Response for a lookup that completed, whatever the resulting state. */
  private def succeeded(tweetId: TweetId, state: DeleteState, tweet: Option[DeletedTweet]) =
    DeletedTweetResponse(
      tweetId = tweetId,
      overallResponse = TweetResponseCode.Success,
      deleteState = state,
      tweet = tweet
    )

  private def notFound(tweetId: TweetId) =
    succeeded(tweetId, DeleteState.NotFound, None)

  private def softDeleted(record: TweetStateRecord.SoftDeleted, storedTweet: StoredTweet) = {
    val deleted = StorageConversions
      .toDeletedTweet(storedTweet)
      .copy(deletedAtMsec = Some(record.createdAt))
    succeeded(record.tweetId, DeleteState.SoftDeleted, Some(deleted))
  }

  private def bounceDeleted(record: TweetStateRecord.BounceDeleted, storedTweet: StoredTweet) = {
    val deleted = StorageConversions
      .toDeletedTweet(storedTweet)
      .copy(deletedAtMsec = Some(record.createdAt))
    succeeded(record.tweetId, DeleteState.BounceDeleted, Some(deleted))
  }

  // For a hard delete the record's own creation time is the hard-delete time, and the record
  // carries the earlier deletion time separately.
  private def hardDeleted(record: TweetStateRecord.HardDeleted, storedTweet: StoredTweet) = {
    val deleted = StorageConversions
      .toDeletedTweet(storedTweet)
      .copy(
        hardDeletedAtMsec = Some(record.createdAt),
        deletedAtMsec = Some(record.deletedAt)
      )
    succeeded(record.tweetId, DeleteState.HardDeleted, Some(deleted))
  }

  /**
   * notDeleted returns a tweet to simplify tweetypie.handler.UndeleteTweetHandler
   */
  private def notDeleted(tweetId: TweetId, storedTweet: StoredTweet) =
    succeeded(tweetId, DeleteState.NotDeleted, Some(StorageConversions.toDeletedTweet(storedTweet)))
}
|
|
@ -0,0 +1,126 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.conversions.DurationOps._
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.stitch.Stitch
|
||||
import com.twitter.stitch.StitchSeqGroup
|
||||
import com.twitter.tweetypie.storage.TweetStorageClient.GetStoredTweet
|
||||
import com.twitter.tweetypie.storage.TweetStorageClient.GetStoredTweet.Error
|
||||
import com.twitter.tweetypie.storage.TweetStorageClient.GetStoredTweet.Response._
|
||||
import com.twitter.tweetypie.storage.TweetUtils._
|
||||
import com.twitter.tweetypie.thriftscala.Tweet
|
||||
import com.twitter.util.Time
|
||||
import com.twitter.util.Try
|
||||
import scala.collection.mutable
|
||||
|
||||
/**
 * Builds the storage client's `GetStoredTweet` operation, which returns a tweet as it is stored
 * (scrubbed fields included) together with its state records and any inconsistencies found.
 */
object GetStoredTweetHandler {

  /** Extractor matching a most-recent state that is a soft, hard or bounce delete. */
  private[this] object DeletedState {
    def unapply(stateRecord: Option[TweetStateRecord]): Option[TweetStateRecord] =
      stateRecord.filter {
        case _: TweetStateRecord.SoftDeleted | _: TweetStateRecord.HardDeleted |
            _: TweetStateRecord.BounceDeleted =>
          true
        case _ => false
      }
  }

  /**
   * Time of deletion in milliseconds: `createdAt` of a soft or bounce delete record, `deletedAt`
   * of a hard delete record, and None for every other state.
   */
  private[this] def deletedAtMs(stateRecord: Option[TweetStateRecord]): Option[Long] =
    stateRecord.collect {
      case soft: TweetStateRecord.SoftDeleted => soft.createdAt
      case bounced: TweetStateRecord.BounceDeleted => bounced.createdAt
      case hard: TweetStateRecord.HardDeleted => hard.deletedAt
    }

  /**
   * Converts the records read for one tweet into a response, incrementing one outcome counter
   * on `statsReceiver`.
   */
  private[this] def tweetResponseFromRecords(
    tweetId: TweetId,
    mhRecords: Seq[TweetManhattanRecord],
    statsReceiver: StatsReceiver
  ): GetStoredTweet.Response = {
    // Collected in the order the checks run: corruption first, then the per-tweet validations.
    val problems = mutable.Buffer[Error]()

    val hasStoredTweetFields: Boolean = mhRecords.exists {
      case TweetManhattanRecord(TweetKey(_, _: TweetKey.LKey.FieldKey), _) => true
      case _ => false
    }

    // A tweet is only assembled when at least one field record exists; a failure to assemble it
    // is recorded as corruption rather than thrown.
    val storedTweet =
      if (!hasStoredTweetFields) {
        None
      } else {
        Try(buildStoredTweet(tweetId, mhRecords, includeScrubbed = true))
          .onFailure(_ => problems += Error.TweetIsCorrupt)
          .toOption
      }

    val scrubbedFields: Set[FieldId] = extractScrubbedFields(mhRecords)
    val tweet: Option[Tweet] = storedTweet.map(StorageConversions.fromStoredTweetAllowInvalid)
    val stateRecords: Seq[TweetStateRecord] = TweetStateRecord.fromTweetMhRecords(mhRecords)
    val tweetState: Option[TweetStateRecord] = TweetStateRecord.mostRecent(mhRecords)

    storedTweet.foreach { stored =>
      val presentExpectedFields = stored.getFieldBlobs(expectedFields)
      val allExpectedPresent = expectedFields.forall(presentExpectedFields.contains)
      if (!allExpectedPresent || !isValid(stored)) {
        problems += Error.TweetFieldsMissingOrInvalid
      }

      // Fields marked as scrubbed should no longer carry data.
      if (stored.getFieldBlobs(scrubbedFields).nonEmpty) {
        problems += Error.ScrubbedFieldsPresent
      }

      // Data is still stored although the deletion happened more than 14 days ago.
      val deletedLongAgo = deletedAtMs(tweetState).exists { deletedAt =>
        deletedAt < Time.now.inMilliseconds - 14.days.inMilliseconds
      }
      if (deletedLongAgo) {
        problems += Error.TweetShouldBeHardDeleted
      }
    }

    val err = if (problems.nonEmpty) Some(problems.toList) else None

    // NOTE(review): the combination (None, Some(state other than HardDeleted), None) -- a state
    // record without any stored field and without errors -- is matched by none of the cases
    // below.
    (tweet, tweetState, err) match {
      case (None, None, None) =>
        statsReceiver.counter("not_found").incr()
        NotFound(tweetId)

      case (None, Some(hardDeleted: TweetStateRecord.HardDeleted), None) =>
        statsReceiver.counter("hard_deleted").incr()
        HardDeleted(tweetId, Some(hardDeleted), stateRecords, scrubbedFields)

      case (None, _, Some(errors)) =>
        statsReceiver.counter("failed").incr()
        Failed(tweetId, tweetState, stateRecords, scrubbedFields, errors)

      case (Some(found), _, Some(errors)) =>
        statsReceiver.counter("found_invalid").incr()
        FoundWithErrors(found, tweetState, stateRecords, scrubbedFields, errors)

      case (Some(found), DeletedState(state), None) =>
        statsReceiver.counter("deleted").incr()
        FoundDeleted(found, Some(state), stateRecords, scrubbedFields)

      case (Some(found), _, None) =>
        statsReceiver.counter("found").incr()
        Found(found, tweetState, stateRecords, scrubbedFields)
    }
  }

  /**
   * Creates the operation. Reads go through a StitchSeqGroup; a non-positive tweet id yields
   * `Stitch.NotFound` without reading anything.
   */
  def apply(read: ManhattanOperations.Read, statsReceiver: StatsReceiver): GetStoredTweet = {

    object mhGroup extends StitchSeqGroup[TweetId, Seq[TweetManhattanRecord]] {
      override def run(tweetIds: Seq[TweetId]): Stitch[Seq[Seq[TweetManhattanRecord]]] = {
        Stats.addWidthStat("getStoredTweet", "tweetIds", tweetIds.size, statsReceiver)
        Stitch.traverse(tweetIds)(id => read(id))
      }
    }

    tweetId =>
      if (tweetId > 0) {
        Stitch.call(tweetId, mhGroup).map { mhRecords =>
          tweetResponseFromRecords(tweetId, mhRecords, statsReceiver.scope("getStoredTweet"))
        }
      } else {
        Stitch.NotFound
      }
  }
}
|
|
@ -0,0 +1,167 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.conversions.DurationOps._
|
||||
import com.twitter.finagle.stats.Counter
|
||||
import com.twitter.finagle.stats.NullStatsReceiver
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.logging.Logger
|
||||
import com.twitter.snowflake.id.SnowflakeId
|
||||
import com.twitter.stitch.Stitch
|
||||
import com.twitter.stitch.StitchSeqGroup
|
||||
import com.twitter.storage.client.manhattan.kv.DeniedManhattanException
|
||||
import com.twitter.storage.client.manhattan.kv.ManhattanException
|
||||
import com.twitter.tweetypie.storage.TweetStateRecord.BounceDeleted
|
||||
import com.twitter.tweetypie.storage.TweetStateRecord.HardDeleted
|
||||
import com.twitter.tweetypie.storage.TweetStateRecord.SoftDeleted
|
||||
import com.twitter.tweetypie.storage.TweetStorageClient.GetTweet
|
||||
import com.twitter.tweetypie.storage.TweetUtils._
|
||||
import com.twitter.util.Duration
|
||||
import com.twitter.util.Return
|
||||
import com.twitter.util.Throw
|
||||
import com.twitter.util.Time
|
||||
|
||||
/**
 * Reads the Manhattan records of a tweet and converts them into a [[GetTweet.Response]].
 */
object GetTweetHandler {
  private[this] val logger = Logger(getClass)

  //////////////////////////////////////////////////
  // Logging racy reads for later validation.

  val RacyTweetWindow: Duration = 10.seconds

  /**
   * A read that happens within [[RacyTweetWindow]] of the tweet's creation would usually
   * be expected to be served from cache. Such an early read indicates that the tweet is
   * prone to consistency issues, so the keys present in Manhattan at the time of the read
   * are logged for later analysis. Only snowflake ids are considered, because the tweet
   * age is derived from the id.
   */
  private[this] def logRacyRead(tweetId: TweetId, records: Seq[TweetManhattanRecord]): Unit =
    if (SnowflakeId.isSnowflakeId(tweetId)) {
      val tweetAge = Time.now.since(SnowflakeId(tweetId).time)
      if (tweetAge <= RacyTweetWindow) {
        // One tab-separated column per record: "<lkey>" or "<lkey>:<timestamp millis>".
        // The timestamp lets us tell later on whether a value should have been present:
        // keys written in a single write are expected to share a timestamp, while keys
        // written in separate writes generally have different ones. The timestamp is
        // optional in Manhattan, but we expect it to always be set.
        val recordColumns = records.map { rec =>
          val timestampSuffix =
            rec.value.timestamp.map(ts => ":" + ts.inMilliseconds).getOrElse("")
          "\t" + rec.lkey + timestampSuffix
        }
        // The age is logged for analysis purposes.
        val header = "racy_tweet_read\t" + tweetId + "\t" + tweetAge.inMilliseconds
        logger.info(header + recordColumns.mkString)
      }
    }

  /**
   * Convert a set of records from Manhattan into a GetTweet.Response.
   *
   * @param tweetId       id of the tweet the records belong to
   * @param mhRecords     all records read from Manhattan for the tweet
   * @param statsReceiver receives the "valid", "invalid" and "expected_fields_not_present"
   *                      counters
   */
  def tweetResponseFromRecords(
    tweetId: TweetId,
    mhRecords: Seq[TweetManhattanRecord],
    statsReceiver: StatsReceiver = NullStatsReceiver
  ): GetTweet.Response = {
    // If none of the expected fields are present, or the stored tweet is not valid, the
    // tweet is considered not returnable (even if some additional fields are present).
    def returnableTweet() = {
      val storedTweet = buildStoredTweet(tweetId, mhRecords)
      if (storedTweet.getFieldBlobs(expectedFields).isEmpty) {
        // The Tweet contained none of the fields defined in `expectedFields`
        log.info(s"Expected Fields Not Present Tweet Id: $tweetId")
        statsReceiver.counter("expected_fields_not_present").incr()
        None
      } else if (isValid(storedTweet)) {
        statsReceiver.counter("valid").incr()
        Some(StorageConversions.fromStoredTweet(storedTweet))
      } else {
        log.info(s"Invalid Tweet Id: $tweetId")
        statsReceiver.counter("invalid").incr()
        None
      }
    }

    if (mhRecords.isEmpty) {
      GetTweet.Response.NotFound
    } else {
      TweetStateRecord.mostRecent(mhRecords) match {
        // Soft and hard deletes never require an attempt to construct a Tweet.
        case Some(_: SoftDeleted) | Some(_: HardDeleted) =>
          GetTweet.Response.Deleted

        // All other states require an attempt to construct a Tweet, which may fail.
        case state =>
          logRacyRead(tweetId, mhRecords)
          val tweet = returnableTweet()
          state match {
            // BounceDeleted contains the Tweet data so that callers can access data on
            // the tweet (e.g. the hard delete daemon requires conversationId and userId).
            // There are no plans for Tweetypie server to make use of the returned tweet
            // at this time.
            case Some(_: BounceDeleted) =>
              tweet.fold[GetTweet.Response](GetTweet.Response.Deleted)(
                GetTweet.Response.BounceDeleted(_))
            case _ =>
              tweet.fold[GetTweet.Response](GetTweet.Response.NotFound)(
                GetTweet.Response.Found(_))
          }
      }
    }
  }

  /**
   * Builds the GetTweet function: batches reads through a Stitch group, converts the
   * records with [[tweetResponseFromRecords]] and counts the outcome under the "getTweet"
   * scope. A DeniedManhattanException is translated into RateLimited; every other failure
   * is passed through unchanged.
   */
  def apply(read: ManhattanOperations.Read, statsReceiver: StatsReceiver): GetTweet = {

    object stats {
      val getTweetScope = statsReceiver.scope("getTweet")
      val deniedCounter: Counter = getTweetScope.counter("mh_denied")
      val mhExceptionCounter: Counter = getTweetScope.counter("mh_exception")
      val nonFatalExceptionCounter: Counter = getTweetScope.counter("non_fatal_exception")
      val notFoundCounter: Counter = getTweetScope.counter("not_found")
    }

    object mhGroup extends StitchSeqGroup[TweetId, Seq[TweetManhattanRecord]] {
      override def run(tweetIds: Seq[TweetId]): Stitch[Seq[Seq[TweetManhattanRecord]]] = {
        Stats.addWidthStat("getTweet", "tweetIds", tweetIds.size, statsReceiver)
        Stitch.traverse(tweetIds)(read(_))
      }
    }

    tweetId =>
      if (tweetId <= 0) {
        Stitch.NotFound
      } else {
        Stitch
          .call(tweetId, mhGroup)
          .map(tweetResponseFromRecords(tweetId, _, stats.getTweetScope))
          .liftToTry
          .map {
            case notFound @ Return(GetTweet.Response.NotFound) =>
              stats.notFoundCounter.incr()
              notFound

            case found @ Return(_) => found

            // Manhattan denied the request: surface it as rate limiting.
            case Throw(denied: DeniedManhattanException) =>
              stats.deniedCounter.incr()
              Throw(RateLimited("", denied))

            // Encountered some other Manhattan error
            case mhFailure @ Throw(_: ManhattanException) =>
              stats.mhExceptionCounter.incr()
              mhFailure

            // Something else happened
            case failure @ Throw(unexpected) =>
              stats.nonFatalExceptionCounter.incr()
              TweetUtils.log
                .warning(
                  unexpected,
                  s"Unhandled exception in GetTweetHandler for tweetId: $tweetId")
              failure
          }
          .lowerFromTry
      }
  }
}
|
|
@ -0,0 +1,153 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.stitch.Stitch
|
||||
import com.twitter.tweetypie.storage.TweetKey.LKey.ForceAddedStateKey
|
||||
import com.twitter.tweetypie.storage.TweetStorageClient.HardDeleteTweet
|
||||
import com.twitter.tweetypie.storage.TweetStorageClient.HardDeleteTweet.Response._
|
||||
import com.twitter.tweetypie.storage.TweetUtils._
|
||||
import com.twitter.util.Return
|
||||
import com.twitter.util.Throw
|
||||
import com.twitter.util.Time
|
||||
import com.twitter.util.Try
|
||||
|
||||
/**
 * Permanently removes the stored data of a tweet that is already in a deleted state.
 */
object HardDeleteTweetHandler {

  /**
   * Tells whether a key is removed permanently when its tweet is hard deleted: field
   * keys (core, internal, additional) and the soft-delete, bounce-delete, undelete and
   * force-added state keys. Any other key is kept.
   */
  private[storage] def isKeyToBeDeleted(key: TweetKey): Boolean =
    key.lKey match {
      // tweet content
      case TweetKey.LKey.CoreFieldsKey | TweetKey.LKey.InternalFieldsKey(_) |
          TweetKey.LKey.AdditionalFieldsKey(_) =>
        true
      // state records other than the hard deletion record itself
      case TweetKey.LKey.SoftDeletionStateKey | TweetKey.LKey.BounceDeletionStateKey |
          TweetKey.LKey.UnDeletionStateKey | TweetKey.LKey.ForceAddedStateKey =>
        true
      case _ =>
        false
    }

  /**
   * When hard deleting, there are two actions: writing the hard deletion record and
   * removing the tweet data. Whenever any action is performed the tweet data is always
   * removed; the record only has to be written if the tweet does not have one yet.
   *
   * The name of the most recent state (or "no_tweet_state_record") is counted on `stats`.
   *
   * @return Return(Some(record)) if a HardDeleted record needs to be written,
   *         Return(None) if it has already been written, and Throw(NotDeleted) if the
   *         tweet is not in a deleted state.
   */
  private[storage] def getHardDeleteStateRecord(
    tweetId: TweetId,
    records: Seq[TweetManhattanRecord],
    mhTimestamp: Time,
    stats: StatsReceiver
  ): Try[Option[TweetStateRecord.HardDeleted]] = {
    val mostRecent = TweetStateRecord.mostRecent(records)
    val currentStateStr = mostRecent.map(_.name).getOrElse("no_tweet_state_record")
    stats.counter(currentStateStr).incr()

    mostRecent match {
      case Some(
            deletion @ (_: TweetStateRecord.SoftDeleted | _: TweetStateRecord.BounceDeleted)) =>
        val hardDeleted =
          TweetStateRecord.HardDeleted(
            tweetId = tweetId,
            // createdAt is the hard deletion timestamp when dealing with hard deletes in Manhattan
            createdAt = mhTimestamp.inMillis,
            // deletedAt is the soft deletion timestamp when dealing with hard deletes in Manhattan
            deletedAt = deletion.createdAt
          )
        Return(Some(hardDeleted))

      case Some(_: TweetStateRecord.HardDeleted) =>
        Return(None)

      case Some(_: TweetStateRecord.ForceAdded) =>
        Throw(NotDeleted(tweetId, Some(ForceAddedStateKey)))

      case Some(_: TweetStateRecord.Undeleted) =>
        Throw(NotDeleted(tweetId, Some(TweetKey.LKey.UnDeletionStateKey)))

      case None =>
        Throw(NotDeleted(tweetId, None))
    }
  }

  /**
   * This handler returns HardDeleteTweet.Response.Deleted if data associated with the
   * tweet is deleted, either as a result of this request or a previous one.
   *
   * The most recently added state record determines the tweet's state. Data is only
   * deleted for tweets whose most recent state is soft-deleted, bounce-deleted or
   * hard-deleted; for any other state the response is NotDeleted and the "cancelled"
   * counter is incremented. (Calling hardDeleteTweet for tweets that have already been
   * hard-deleted will remove any lkeys that may not have been deleted previously.)
   */
  def apply(
    read: ManhattanOperations.Read,
    insert: ManhattanOperations.Insert,
    delete: ManhattanOperations.Delete,
    scribe: Scribe,
    stats: StatsReceiver
  ): TweetId => Stitch[HardDeleteTweet.Response] = {
    val hardDeleteStats = stats.scope("hardDeleteTweet")
    val hardDeleteTweetCancelled = hardDeleteStats.counter("cancelled")
    val beforeStateStats = hardDeleteStats.scope("before_state")

    // Deletes every key at the given timestamp; the collected results are checked by
    // collectWithRateLimitCheck.
    def removeRecords(keys: Seq[TweetKey], mhTimestamp: Time): Stitch[Unit] = {
      val deletions = keys.map(key => delete(key, Some(mhTimestamp)).liftToTry)
      Stitch.collect(deletions).map(collectWithRateLimitCheck).lowerFromTry
    }

    // Writes the hard deletion record if there is one, scribing the removal on success.
    def writeRecord(record: Option[TweetStateRecord.HardDeleted]): Stitch[Unit] =
      record.fold[Stitch[Unit]](Stitch.Unit) { r =>
        insert(r.toTweetMhRecord).onSuccess { _ =>
          scribe.logRemoved(
            r.tweetId,
            Time.fromMilliseconds(r.createdAt),
            isSoftDeleted = false
          )
        }
      }

    tweetId =>
      read(tweetId).flatMap { records =>
        val hardDeletionTimestamp = Time.now
        val keysToBeDeleted: Seq[TweetKey] = records.map(_.key).filter(isKeyToBeDeleted)
        val stateRecord =
          getHardDeleteStateRecord(tweetId, records, hardDeletionTimestamp, beforeStateStats)

        stateRecord match {
          case Return(record) =>
            val writeAndRemove =
              Stitch.join(
                writeRecord(record),
                removeRecords(keysToBeDeleted, hardDeletionTimestamp)
              )
            writeAndRemove.map { _ =>
              // If the tweetId is non-snowflake and has previously been hard deleted
              // there will be no coreData record to fall back on to get the tweet
              // creation time and createdAtMillis will be None.
              Deleted(
                // deletedAtMillis: when the tweet was hard deleted
                deletedAtMillis = Some(hardDeletionTimestamp.inMillis),
                // createdAtMillis: when the tweet itself was created
                // (as opposed to when the deletion record was created)
                createdAtMillis = TweetUtils.creationTimeFromTweetIdOrMHRecords(tweetId, records)
              )
            }

          case Throw(notDeleted: NotDeleted) =>
            hardDeleteTweetCancelled.incr()
            Stitch.value(notDeleted)

          case Throw(e) => Stitch.exception(e) // this should never happen
        }
      }
  }
}
|
|
@ -0,0 +1,228 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.google.common.base.CaseFormat
|
||||
import com.twitter.conversions.DurationOps._
|
||||
import com.twitter.finagle.mtls.authentication.ServiceIdentifier
|
||||
import com.twitter.scrooge.TFieldBlob
|
||||
import com.twitter.scrooge.ThriftStructFieldInfo
|
||||
import com.twitter.stitch.Stitch
|
||||
import com.twitter.storage.client.manhattan.kv._
|
||||
import com.twitter.tweetypie.additionalfields.AdditionalFields
|
||||
import com.twitter.tweetypie.storage.ManhattanOperations.Read
|
||||
import com.twitter.tweetypie.storage.TweetUtils._
|
||||
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
|
||||
import com.twitter.tweetypie.thriftscala.{Tweet => TweetypieTweet}
|
||||
import com.twitter.util.Duration
|
||||
import com.twitter.util.Future
|
||||
import com.twitter.util.Return
|
||||
import com.twitter.util.Throw
|
||||
import diffshow.Container
|
||||
import diffshow.DiffShow
|
||||
import diffshow.Expr
|
||||
import org.apache.commons.codec.binary.Base64
|
||||
import scala.util.Try
|
||||
import shapeless.Cached
|
||||
import shapeless.Strict
|
||||
|
||||
/**
 * This class is used by the Tweetypie Console to inspect tweet field content in Manhattan.
 *
 * @param svcIdentifier service identifier used for the mTLS parameters of the Manhattan
 *                      client
 */
class InspectFields(svcIdentifier: ServiceIdentifier) {
  val mhApplicationId = "tbird_mh"
  val mhDatasetName = "tbird_mh"
  val mhDestinationName = "/s/manhattan/cylon.native-thrift"
  val mhTimeout: Duration = 5000.milliseconds

  val localMhEndpoint: ManhattanKVEndpoint =
    ManhattanKVEndpointBuilder(
      ManhattanKVClient(
        mhApplicationId,
        mhDestinationName,
        ManhattanKVClientMtlsParams(svcIdentifier)))
      .defaultGuarantee(Guarantee.SoftDcReadMyWrites)
      .defaultMaxTimeout(mhTimeout)
      .build()

  val readOperation: Read = (new ManhattanOperations(mhDatasetName, localMhEndpoint)).read

  /**
   * Reads all Manhattan records of a tweet and renders them as a human-readable table.
   * A failed read is rendered as the exception's `toString` instead of failing the Future.
   */
  def lookup(tweetId: Long): Future[String] = {
    val result = readOperation(tweetId).liftToTry.map {
      case Return(mhRecords) =>
        prettyPrintManhattanRecords(tweetId, TweetKey.padTweetIdStr(tweetId), mhRecords)
      case Throw(e) => e.toString
    }

    Stitch.run(result)
  }

  /**
   * Reads all Manhattan records of a tweet and assembles them into a StoredTweet.
   * Read failures are propagated through the returned Future.
   */
  def storedTweet(tweetId: Long): Future[StoredTweet] =
    Stitch.run(readOperation(tweetId).map(mhRecords => buildStoredTweet(tweetId, mhRecords)))

  // Renders one aligned row per record (key, field name, content) under a
  // "/tbird_mh/<pkey>/" header line, or "Not Found" when there are no records.
  private[this] def prettyPrintManhattanRecords(
    tweetId: Long,
    pkey: String,
    mhRecords: Seq[TweetManhattanRecord]
  ): String = {
    if (mhRecords.isEmpty) {
      "Not Found"
    } else {
      val formattedRecords = getFormattedManhattanRecords(tweetId, mhRecords)
      // `.max` is safe here: there is one formatted record per (non-empty) input record.
      val keyFieldWidth = formattedRecords.map(_.key.length).max + 2
      val fieldNameFieldWidth = formattedRecords.map(_.fieldName.length).max + 2

      val formatString = s" %-${keyFieldWidth}s %-${fieldNameFieldWidth}s %s"

      // Blank key/field-name columns, used to indent the continuation lines of
      // multi-line content so that they line up with the content column.
      val continuationIndent = formatString.format("", "", "")

      val recordsString =
        formattedRecords
          .map { record =>
            val content = record.content.replace("\n", "\n" + continuationIndent)
            formatString.format(record.key, record.fieldName, content)
          }
          .mkString("\n")

      "/tbird_mh/" + pkey + "/" + "\n" + recordsString
    }
  }

  // Builds one FormattedManhattanRecord per Manhattan record, sorted by lkey.
  private[this] def getFormattedManhattanRecords(
    tweetId: Long,
    mhRecords: Seq[TweetManhattanRecord]
  ): Seq[FormattedManhattanRecord] = {
    val storedTweet = buildStoredTweet(tweetId, mhRecords).copy(updatedAt = None)
    val tweetypieTweet: Option[TweetypieTweet] =
      Try(StorageConversions.fromStoredTweet(storedTweet)).toOption

    // Keyed by field id rather than by field name: every id without a known name maps
    // to the same "<UNKNOWN FIELD>" name, so a name-keyed map would let unknown fields
    // overwrite each other and be decoded with the wrong blob.
    val blobsByFieldId: Map[Short, TFieldBlob] =
      getStoredTweetBlobs(mhRecords).map(blob => blob.field.id -> blob).toMap

    mhRecords
      .map {
        case TweetManhattanRecord(fullKey, mhValue) =>
          FormattedManhattanRecord(
            key = fullKey.lKey.toString,
            fieldName = getFieldName(fullKey.lKey),
            content = prettyPrintManhattanValue(
              fullKey.lKey,
              mhValue,
              storedTweet,
              tweetypieTweet,
              tweetId,
              blobsByFieldId
            )
          )
      }
      .sortBy(_.key.replace("external", "xternal")) // sort by key, with internal first
  }

  // Looks up the thrift field name for a field id, or "<UNKNOWN FIELD>" if there is none.
  private[this] def getFieldNameFromThrift(
    fieldId: Short,
    fieldInfos: List[ThriftStructFieldInfo]
  ): String =
    fieldInfos
      .find(info => info.tfield.id == fieldId)
      .map(_.tfield.name)
      .getOrElse("<UNKNOWN FIELD>")

  // True when the second '/'-separated segment of the lkey is "scrubbed_fields";
  // lkeys with fewer than two segments are not scrubbed-field keys.
  private[this] def isLkeyScrubbedField(lkey: String): Boolean =
    lkey.split("/").lift(1).contains("scrubbed_fields")

  // Field name for field keys; empty for every other kind of lkey.
  private[this] def getFieldName(lkey: TweetKey.LKey): String =
    lkey match {
      case fieldKey: TweetKey.LKey.FieldKey => getFieldName(fieldKey.fieldId)
      case _ => ""
    }

  // Field id 1 is the core fields blob; additional field ids are named after the
  // Tweetypie Tweet struct and all remaining ids after the StoredTweet struct.
  private[this] def getFieldName(fieldId: Short): String =
    if (fieldId == 1) {
      "core_fields"
    } else if (AdditionalFields.isAdditionalFieldId(fieldId)) {
      getFieldNameFromThrift(fieldId, TweetypieTweet.fieldInfos)
    } else {
      getFieldNameFromThrift(fieldId, StoredTweet.fieldInfos)
    }

  // Decodes a value according to the kind of its lkey, falling back to base64 when the
  // value cannot be decoded.
  private[this] def prettyPrintManhattanValue(
    lkey: TweetKey.LKey,
    mhValue: TweetManhattanValue,
    storedTweet: StoredTweet,
    tweetypieTweet: Option[TweetypieTweet],
    tweetId: Long,
    blobsByFieldId: Map[Short, TFieldBlob]
  ): String = {
    val decoded = lkey match {
      case _: TweetKey.LKey.MetadataKey =>
        decodeMetadata(mhValue)

      case fieldKey: TweetKey.LKey.FieldKey =>
        blobsByFieldId
          .get(fieldKey.fieldId)
          .map(blob => decodeField(tweetId, blob, storedTweet, tweetypieTweet))

      case _ =>
        None
    }

    decoded.getOrElse { // If all else fails, encode the data as a base64 string
      // NOTE(review): `array` exposes the whole backing array, ignoring the buffer's
      // position/limit, and is unsupported for read-only or direct buffers. This presumably
      // relies on Manhattan values being plain heap buffers - confirm.
      val contents = mhValue.contents.array
      if (contents.isEmpty) {
        "<NO DATA>"
      } else {
        Base64.encodeBase64String(contents)
      }
    }
  }

  // Metadata values are decoded as JSON; None if the content does not decode.
  private[this] def decodeMetadata(mhValue: TweetManhattanValue): Option[String] = {
    val byteArray = ByteArrayCodec.fromByteBuffer(mhValue.contents)
    Try(Json.decode(byteArray).toString).toOption
  }

  // Renders a single field blob: core fields are taken from the assembled StoredTweet,
  // every other field is set on an otherwise empty Tweet/StoredTweet and printed alone.
  private[this] def decodeField(
    tweetId: Long,
    blob: TFieldBlob,
    storedTweet: StoredTweet,
    tweetypieTweet: Option[TweetypieTweet]
  ): String = {
    val fieldId = blob.field.id

    if (fieldId == 1) {
      coreFields(storedTweet)
    } else if (AdditionalFields.isAdditionalFieldId(fieldId)) {
      decodeTweetWithOneField(TweetypieTweet(tweetId).setField(blob))
    } else {
      decodeTweetWithOneField(StoredTweet(tweetId).setField(blob))
    }
  }

  // Takes a Tweet or StoredTweet with a single field set and returns the value of that field
  private[this] def decodeTweetWithOneField[T](
    tweetWithOneField: T
  )(
    implicit ev: Cached[Strict[DiffShow[T]]]
  ): String = {
    val config = diffshow.Config(hideFieldWithEmptyVal = true)
    val tree: Expr = config.transform(DiffShow.show(tweetWithOneField))

    // matches a Tweet or StoredTweet with two values, the first being the id
    val value = tree.transform {
      case Container(_, List(diffshow.Field("id", _), diffshow.Field(_, fieldValue))) =>
        fieldValue
    }

    config.exprPrinter.apply(value, width = 80).render
  }

  private[this] def coreFields(storedTweet: StoredTweet): String =
    diffshow.show(CoreFieldsCodec.fromTweet(storedTweet), hideFieldWithEmptyVal = true)

  private[this] def toCamelCase(s: String): String =
    CaseFormat.LOWER_UNDERSCORE.to(CaseFormat.LOWER_CAMEL, s)
}
|
||||
|
||||
/**
 * One row of the table printed by [[InspectFields]] for a tweet.
 *
 * @param key       the record's lkey rendered as a string
 * @param fieldName thrift field name for field keys; empty for other kinds of keys
 * @param content   pretty-printed (or base64-encoded) value of the record
 */
case class FormattedManhattanRecord(key: String, fieldName: String, content: String)
|
|
@ -0,0 +1,17 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import com.fasterxml.jackson.module.scala.DefaultScalaModule
|
||||
|
||||
/**
 * JSON encoding and decoding of string-keyed maps, backed by a Jackson ObjectMapper with
 * the Scala module registered.
 */
object Json {
  val TimestampKey = "timestamp"
  val SoftDeleteTimestampKey = "softdelete_timestamp"

  private val mapper = new ObjectMapper
  mapper.registerModule(DefaultScalaModule)

  // Target type handed to Jackson when decoding.
  private val mapClass = classOf[Map[String, Any]]

  /** Serializes the map to JSON bytes. */
  def encode(m: Map[String, Any]): Array[Byte] =
    mapper.writeValueAsBytes(m)

  /** Parses JSON bytes into a map. */
  def decode(arr: Array[Byte]): Map[String, Any] =
    mapper.readValue[Map[String, Any]](arr, mapClass)
}
|
|
@ -0,0 +1,103 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.bijection.Injection
|
||||
import com.twitter.io.Buf
|
||||
import com.twitter.stitch.Stitch
|
||||
import com.twitter.storage.client.manhattan.bijections.Bijections.BufInjection
|
||||
import com.twitter.storage.client.manhattan.kv.ManhattanKVEndpoint
|
||||
import com.twitter.storage.client.manhattan.kv.impl.DescriptorP1L1
|
||||
import com.twitter.storage.client.manhattan.kv.impl.Component
|
||||
import com.twitter.storage.client.manhattan.kv.{impl => mh}
|
||||
import com.twitter.storage.client.manhattan.bijections.Bijections.StringInjection
|
||||
import com.twitter.util.Time
|
||||
import java.nio.ByteBuffer
|
||||
import scala.util.control.NonFatal
|
||||
|
||||
/**
 * A single Manhattan record of a tweet: its key (tweet id plus lkey) and its value.
 */
case class TweetManhattanRecord(key: TweetKey, value: TweetManhattanValue) {
  def pkey: TweetId = key.tweetId
  def lkey: TweetKey.LKey = key.lKey

  /**
   * Produces a representation that is human-readable, but contains
   * all of the information from the record. It is not intended for
   * producing machine-readable values.
   *
   * Metadata values are shown as strings, field values as a TFieldBlob with
   * hex-encoded content, and values under unknown keys as raw hex. If decoding
   * fails, the raw hex is shown together with the failure.
   *
   * This conversion is relatively expensive, so beware of using it in
   * hot code paths.
   */
  override def toString: String = {
    // Raw contents of the value as a "0x"-prefixed hex string.
    def rawHex: String = "0x" + Buf.slowHexString(Buf.ByteBuffer.Shared(value.contents))

    val valueString =
      try {
        key.lKey match {
          case _: TweetKey.LKey.MetadataKey =>
            StringCodec.fromByteBuffer(value.contents)

          case _: TweetKey.LKey.FieldKey =>
            val blob = TFieldBlobCodec.fromByteBuffer(value.contents)
            s"TFieldBlob(${blob.field}, 0x${Buf.slowHexString(blob.content)})"

          case TweetKey.LKey.Unknown(_) =>
            rawHex
        }
      } catch {
        case NonFatal(e) => s"$rawHex (failed to decode due to $e)"
      }

    s"$key => ${value.copy(contents = valueString)}"
  }
}
|
||||
|
||||
/**
 * Function types for the Manhattan operations used by the tweet storage handlers, and
 * the key/value descriptors that map tweet keys and values onto Manhattan.
 */
object ManhattanOperations {
  /** Reads the records stored under a tweet id. */
  type Read = TweetId => Stitch[Seq[TweetManhattanRecord]]

  /** Writes a single record. */
  type Insert = TweetManhattanRecord => Stitch[Unit]

  /** Deletes a single key, with an optional timestamp. */
  type Delete = (TweetKey, Option[Time]) => Stitch[Unit]

  /** Deletes the range of keys under a tweet id. */
  type DeleteRange = TweetId => Stitch[Unit]

  /** Encodes a tweet id as its padded string form and parses it back as a Long. */
  object PkeyInjection extends Injection[TweetId, String] {
    override def apply(tweetId: TweetId): String =
      TweetKey.padTweetIdStr(tweetId)

    override def invert(str: String): scala.util.Try[TweetId] =
      scala.util.Try(str.toLong)
  }

  case class InvalidLkey(lkeyStr: String) extends Exception

  /** Encodes an lkey via `toString`; decoding goes through `TweetKey.LKey.fromString`. */
  object LkeyInjection extends Injection[TweetKey.LKey, String] {
    override def apply(lkey: TweetKey.LKey): String =
      lkey.toString

    override def invert(str: String): scala.util.Try[TweetKey.LKey] = {
      val parsed = TweetKey.LKey.fromString(str)
      scala.util.Success(parsed)
    }
  }

  val KeyDescriptor: DescriptorP1L1.EmptyKey[TweetId, TweetKey.LKey] =
    mh.KeyDescriptor(
      Component(PkeyInjection.andThen(StringInjection)),
      Component(LkeyInjection.andThen(StringInjection))
    )

  val ValueDescriptor: mh.ValueDescriptor.EmptyValue[ByteBuffer] =
    mh.ValueDescriptor(BufInjection)
}
|
||||
|
||||
/**
 * The Manhattan operations bound to one dataset and one endpoint.
 *
 * @param dataset    name of the Manhattan dataset the keys are built for
 * @param mhEndpoint endpoint every operation is sent to
 */
class ManhattanOperations(dataset: String, mhEndpoint: ManhattanKVEndpoint) {
  import ManhattanOperations._

  // Partial key selecting everything stored for the given tweet id in `dataset`.
  private[this] def pkey(tweetId: TweetId) = KeyDescriptor.withDataset(dataset).withPkey(tweetId)

  /** Reads the records under a tweet id with a slice query. */
  def read: Read =
    tweetId =>
      mhEndpoint
        .slice(pkey(tweetId).under(), ValueDescriptor)
        .map(_.map {
          case (key, value) => TweetManhattanRecord(TweetKey(key.pkey, key.lkey), value)
        })

  /** Inserts a single record under its tweet id and lkey. */
  def insert: Insert =
    record =>
      mhEndpoint.insert(
        pkey(record.key.tweetId).withLkey(record.key.lKey),
        ValueDescriptor.withValue(record.value))

  /** Deletes a single key, passing the optional timestamp through to Manhattan. */
  def delete: Delete =
    (key, time) => {
      val mhKey = pkey(key.tweetId).withLkey(key.lKey)
      mhEndpoint.delete(mhKey, time)
    }

  /** Deletes the range of keys under a tweet id. */
  def deleteRange: DeleteRange =
    tweetId => mhEndpoint.deleteRange(pkey(tweetId).under())
}
|
|
@ -0,0 +1,451 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.conversions.DurationOps._
|
||||
import com.twitter.finagle.mtls.authentication.EmptyServiceIdentifier
|
||||
import com.twitter.finagle.mtls.authentication.ServiceIdentifier
|
||||
import com.twitter.finagle.ssl.OpportunisticTls
|
||||
import com.twitter.finagle.stats.NullStatsReceiver
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.logging.BareFormatter
|
||||
import com.twitter.logging.Level
|
||||
import com.twitter.logging.ScribeHandler
|
||||
import com.twitter.logging._
|
||||
import com.twitter.stitch.Stitch
|
||||
import com.twitter.storage.client.manhattan.bijections.Bijections._
|
||||
import com.twitter.storage.client.manhattan.kv._
|
||||
import com.twitter.storage.client.manhattan.kv.impl.ValueDescriptor
|
||||
import com.twitter.tweetypie.client_id.ClientIdHelper
|
||||
import com.twitter.tweetypie.storage.Scribe.ScribeHandlerFactory
|
||||
import com.twitter.tweetypie.storage.TweetStorageClient.BounceDelete
|
||||
import com.twitter.tweetypie.storage.TweetStorageClient.GetTweet
|
||||
import com.twitter.tweetypie.storage.TweetStorageClient.HardDeleteTweet
|
||||
import com.twitter.tweetypie.thriftscala.Tweet
|
||||
import com.twitter.tweetypie.util.StitchUtils
|
||||
import com.twitter.util.Duration
|
||||
import com.twitter.util.Return
|
||||
import com.twitter.util.Throw
|
||||
import scala.util.Random
|
||||
|
||||
/**
 * Configuration and factory helpers for [[ManhattanTweetStorageClient]].
 */
object ManhattanTweetStorageClient {
  object Config {

    /**
     * The Manhattan dataset where tweets are stored is not externally
     * configurable because writing tweets to a non-production dataset
     * requires great care. Staging instances using a different dataset will
     * write tweets to a non-production store, but will publish events, log to
     * HDFS, and cache data referencing tweets in that store which are not
     * accessible by the rest of the production cluster.
     *
     * In a completely isolated environment it should be safe to write to
     * other datasets for testing purposes.
     */
    val Dataset = "tbird_mh"

    /**
     * Once a tweet has been deleted it can only be undeleted within this time
     * window, after which [[UndeleteHandler]] will return an error on
     * undelete attempts.
     */
    val UndeleteWindowHours = 240

    /**
     * Default label used for underlying Manhattan Thrift client metrics
     *
     * The finagle client metrics will be exported at clnt/:label.
     */
    val ThriftClientLabel = "mh_cylon"

    /**
     * Return the corresponding Wily path for the Cylon cluster in the "other" DC
     *
     * @throws IllegalArgumentException if the zone is not one of the known zones
     */
    def remoteDestination(zone: String): String =
      s"/srv#/prod/${remoteZone(zone)}/manhattan/cylon.native-thrift"

    // Maps a zone to the zone of the other DC; "localhost" is treated like "atla".
    private def remoteZone(zone: String) = zone match {
      case "pdxa" => "atla"
      case "atla" | "localhost" => "pdxa"
      case _ =>
        throw new IllegalArgumentException(s"Cannot configure remote DC for unknown zone '$zone'")
    }
  }

  /**
   * @param applicationId Manhattan application id used for quota accounting
   * @param localDestination Wily path to local Manhattan cluster
   * @param localTimeout Overall timeout (including retries) for all reads/writes to local cluster
   * @param remoteDestination Wily path to remote Manhattan cluster, used for undelete and force add
   * @param remoteTimeout Overall timeout (including retries) for all reads/writes to remote cluster
   * @param undeleteWindowHours Amount of time during which a deleted tweet can be undeleted
   * @param thriftClientLabel Label used to scope stats for Manhattan Thrift client
   * @param maxRequestsPerBatch Configure the Stitch RequestGroup.Generator batch size
   * @param serviceIdentifier The ServiceIdentifier to use when making connections to a Manhattan cluster
   * @param opportunisticTlsLevel The level to use for opportunistic TLS for connections to the Manhattan cluster
   */
  case class Config(
    applicationId: String,
    localDestination: String,
    localTimeout: Duration,
    remoteDestination: String,
    remoteTimeout: Duration,
    undeleteWindowHours: Int = Config.UndeleteWindowHours,
    thriftClientLabel: String = Config.ThriftClientLabel,
    maxRequestsPerBatch: Int = Int.MaxValue,
    serviceIdentifier: ServiceIdentifier,
    opportunisticTlsLevel: OpportunisticTls.Level)

  /**
   * Sanitizes the input for APIs which take in a (Tweet, Seq[Field]) as input.
   *
   * NOTE: This function only applies sanity checks which are common to
   * all APIs which take in a (Tweet, Seq[Field]) as input. API specific
   * checks are not covered here.
   *
   * @param apiStitch the backing API call
   * @tparam T the output type of the backing API call
   * @return a stitch function which does some basic input sanity checking
   * @throws IllegalArgumentException (from the returned function) if any field id is
   *                                  not a positive number
   */
  private[storage] def sanitizeTweetFields[T](
    apiStitch: (Tweet, Seq[Field]) => Stitch[T]
  ): (Tweet, Seq[Field]) => Stitch[T] =
    (tweet, fields) => {
      require(fields.forall(_.id > 0), s"Field ids ${fields} are not positive numbers")
      apiStitch(tweet, fields)
    }

  // Returns a handler that asynchronously logs messages to Scribe using the BareFormatter which
  // logs just the message without any additional metadata
  def scribeHandler(categoryName: String): HandlerFactory =
    ScribeHandler(
      formatter = BareFormatter,
      maxMessagesPerTransaction = 100,
      category = categoryName,
      level = Some(Level.TRACE)
    )

  /**
   * A Config appropriate for interactive sessions and scripts.
   *
   * The application id and the service identifier are both derived from the USER
   * environment variable, falling back to "<unknown>" when it is not set.
   */
  def develConfig(): Config = {
    // System.getenv returns null for an unset variable; resolve it once so that a null
    // is never passed on to ServiceIdentifier.
    val user = Option(System.getenv("USER")).getOrElse("<unknown>")

    Config(
      applicationId = user + ".devel",
      localDestination = "/s/manhattan/cylon.native-thrift",
      localTimeout = 10.seconds,
      remoteDestination = "/s/manhattan/cylon.native-thrift",
      remoteTimeout = 10.seconds,
      undeleteWindowHours = Config.UndeleteWindowHours,
      thriftClientLabel = Config.ThriftClientLabel,
      maxRequestsPerBatch = Int.MaxValue,
      serviceIdentifier = ServiceIdentifier(user, "tweetypie", "devel", "local"),
      opportunisticTlsLevel = OpportunisticTls.Required
    )
  }

  /**
   * Build a Manhattan tweet storage client for use in interactive
   * sessions and scripts.
   */
  def devel(): TweetStorageClient =
    new ManhattanTweetStorageClient(
      develConfig(),
      NullStatsReceiver,
      ClientIdHelper.default
    )
}
|
||||
|
||||
/**
 * [[TweetStorageClient]] implementation backed by Manhattan.
 *
 * Two Manhattan endpoints are built from `config`: a local one, used by every operation, and a
 * remote one, which in this class is only used for the remote insert performed by `undelete`.
 *
 * @param config         destinations, timeouts and mTLS settings for the Manhattan clients
 * @param statsReceiver  root receiver under which all metrics of this client are reported
 * @param clientIdHelper used to resolve the calling client id for per-client request counters
 */
class ManhattanTweetStorageClient(
  config: ManhattanTweetStorageClient.Config,
  statsReceiver: StatsReceiver,
  private val clientIdHelper: ClientIdHelper)
    extends TweetStorageClient {
  import ManhattanTweetStorageClient._

  // Lazy so that it is initialized on first use by `scribe` below.
  lazy val scribeHandlerFactory: ScribeHandlerFactory = scribeHandler _
  val scribe: Scribe = new Scribe(scribeHandlerFactory, statsReceiver)

  /**
   * Builds a Manhattan KV client for the given destination.
   *
   * mTLS parameters are only supplied when the configured service identifier is not
   * `EmptyServiceIdentifier`.
   *
   * @param dest  the Manhattan destination to connect to
   * @param label the label passed to the underlying client
   */
  def mkClient(
    dest: String,
    label: String
  ): ManhattanKVClient = {
    val mhMtlsParams =
      if (config.serviceIdentifier == EmptyServiceIdentifier) NoMtlsParams
      else
        ManhattanKVClientMtlsParams(
          serviceIdentifier = config.serviceIdentifier,
          opportunisticTls = config.opportunisticTlsLevel
        )

    new ManhattanKVClient(
      config.applicationId,
      dest,
      mhMtlsParams,
      label,
      Seq(Experiments.ApertureLoadBalancer))
  }

  val localClient: ManhattanKVClient = mkClient(config.localDestination, config.thriftClientLabel)

  val localMhEndpoint: ManhattanKVEndpoint = ManhattanKVEndpointBuilder(localClient)
    .defaultGuarantee(Guarantee.SoftDcReadMyWrites)
    .defaultMaxTimeout(config.localTimeout)
    .maxRequestsPerBatch(config.maxRequestsPerBatch)
    .build()

  val localManhattanOperations = new ManhattanOperations(Config.Dataset, localMhEndpoint)

  val remoteClient: ManhattanKVClient =
    mkClient(config.remoteDestination, s"${config.thriftClientLabel}_remote")

  // NOTE(review): unlike the local endpoint, no maxRequestsPerBatch is set here -- confirm
  // that this is intentional.
  val remoteMhEndpoint: ManhattanKVEndpoint = ManhattanKVEndpointBuilder(remoteClient)
    .defaultGuarantee(Guarantee.SoftDcReadMyWrites)
    .defaultMaxTimeout(config.remoteTimeout)
    .build()

  val remoteManhattanOperations = new ManhattanOperations(Config.Dataset, remoteMhEndpoint)

  /**
   * Maps exceptions raised by an API call to the exceptions exposed to callers:
   *
   *  - `IllegalArgumentException` becomes `ClientError`
   *  - `DeniedManhattanException` becomes `RateLimited`
   *  - `VersionMismatchError` is counted under `<apiName>/mh_version_mismatches` and passed on
   *  - `InternalError` is logged and passed on
   *
   * Any other exception is not handled by the returned partial function.
   *
   * Note: This translation is only useful for non-batch endpoints. Batch endpoints currently
   * represent failure without propagating an exception
   * (e.g. [[com.twitter.tweetypie.storage.Response.TweetResponseCode.Failure]]).
   */
  private[this] def translateExceptions(
    apiName: String,
    statsReceiver: StatsReceiver
  ): PartialFunction[Throwable, Throwable] = {
    case e: IllegalArgumentException => ClientError(e.getMessage, e)
    case e: DeniedManhattanException => RateLimited(e.getMessage, e)
    case e: VersionMismatchError =>
      statsReceiver.scope(apiName).counter("mh_version_mismatches").incr()
      e
    case e: InternalError =>
      TweetUtils.log.error(e, s"Error processing $apiName request: ${e.getMessage}")
      e
  }

  /**
   * Count requests per client id producing metrics of the form
   * .../clients/:root_client_id/requests
   *
   * The client id is resolved on every call; `ClientIdHelper.UnknownClientId` is used when no
   * effective client id root is available.
   *
   * @param apiStitch      the API call to wrap
   * @param statsReceiver  receiver under which the `clients` scope is created
   * @param clientIdHelper resolves the effective client id root of the current request
   * @return a function that increments the per-client counter and then calls `apiStitch`
   */
  def observeClientId[A, B](
    apiStitch: A => Stitch[B],
    statsReceiver: StatsReceiver,
    clientIdHelper: ClientIdHelper,
  ): A => Stitch[B] = {
    val clients = statsReceiver.scope("clients")

    a => {
      val clientId = clientIdHelper.effectiveClientIdRoot.getOrElse(ClientIdHelper.UnknownClientId)
      clients.counter(clientId, "requests").incr()
      apiStitch(a)
    }
  }

  /**
   * Increment counters based on the overall response status of the returned [[GetTweet.Response]].
   *
   * Counters live under the `response_code` scope of `statsReceiver`. A `RateLimited` failure is
   * counted as `over_capacity`; every other failure is counted as `failure`.
   */
  def observeGetTweetResponseCode[A](
    apiStitch: A => Stitch[GetTweet.Response],
    statsReceiver: StatsReceiver
  ): A => Stitch[GetTweet.Response] = {
    val scope = statsReceiver.scope("response_code")

    val success = scope.counter("success")
    val notFound = scope.counter("not_found")
    val failure = scope.counter("failure")
    val overCapacity = scope.counter("over_capacity")
    val deleted = scope.counter("deleted")
    val bounceDeleted = scope.counter("bounce_deleted")

    a =>
      apiStitch(a).respond {
        case Return(_: GetTweet.Response.Found) => success.incr()
        case Return(GetTweet.Response.NotFound) => notFound.incr()
        case Return(_: GetTweet.Response.BounceDeleted) => bounceDeleted.incr()
        case Return(GetTweet.Response.Deleted) => deleted.incr()
        case Throw(_: RateLimited) => overCapacity.incr()
        case Throw(_) => failure.incr()
      }
  }

  /**
   * We do 3 things here:
   *
   * - Bookkeeping for overall requests
   * - Bookkeeping for per api requests
   * - Translate exceptions
   *
   * @param apiName the API being called
   * @param apiStitch the implementation of the API
   * @tparam A template for input type of API
   * @tparam B template for output type of API
   * @return Function which executes the given API call
   */
  private[storage] def endpoint[A, B](
    apiName: String,
    apiStitch: A => Stitch[B]
  ): A => Stitch[B] = {
    val translateException = translateExceptions(apiName, statsReceiver)
    val observe = StitchUtils.observe[B](statsReceiver, apiName)

    a =>
      StitchUtils.translateExceptions(
        observe(apiStitch(a)),
        translateException
      )
  }

  /**
   * Two-argument variant of `endpoint`: the API is tupled, wrapped by `endpoint`, and untupled
   * again.
   *
   * @param clientIdHelper not used by this method; kept so existing call sites keep compiling
   */
  private[storage] def endpoint2[A, B, C](
    apiName: String,
    apiStitch: (A, B) => Stitch[C],
    clientIdHelper: ClientIdHelper,
  ): (A, B) => Stitch[C] =
    Function.untupled(endpoint(apiName, apiStitch.tupled))

  // Reads from the local endpoint; observed per response code and per client id.
  val getTweet: TweetStorageClient.GetTweet = {
    val stats = statsReceiver.scope("getTweet")

    observeClientId(
      observeGetTweetResponseCode(
        endpoint(
          "getTweet",
          GetTweetHandler(
            read = localManhattanOperations.read,
            statsReceiver = stats,
          )
        ),
        stats,
      ),
      stats,
      clientIdHelper,
    )
  }

  // Reads from the local endpoint; observed per client id.
  val getStoredTweet: TweetStorageClient.GetStoredTweet = {
    val stats = statsReceiver.scope("getStoredTweet")

    observeClientId(
      endpoint(
        "getStoredTweet",
        GetStoredTweetHandler(
          read = localManhattanOperations.read,
          statsReceiver = stats,
        )
      ),
      stats,
      clientIdHelper,
    )
  }

  val addTweet: TweetStorageClient.AddTweet =
    endpoint(
      "addTweet",
      AddTweetHandler(
        insert = localManhattanOperations.insert,
        scribe = scribe,
        stats = statsReceiver
      )
    )

  // Field ids are validated by sanitizeTweetFields before the update handler runs.
  val updateTweet: TweetStorageClient.UpdateTweet =
    endpoint2(
      "updateTweet",
      ManhattanTweetStorageClient.sanitizeTweetFields(
        UpdateTweetHandler(
          insert = localManhattanOperations.insert,
          stats = statsReceiver,
        )
      ),
      clientIdHelper,
    )

  val softDelete: TweetStorageClient.SoftDelete =
    endpoint(
      "softDelete",
      SoftDeleteHandler(
        insert = localManhattanOperations.insert,
        scribe = scribe
      )
    )

  val bounceDelete: BounceDelete =
    endpoint(
      "bounceDelete",
      BounceDeleteHandler(
        insert = localManhattanOperations.insert,
        scribe = scribe
      )
    )

  // The only operation that also writes through the remote endpoint.
  val undelete: TweetStorageClient.Undelete =
    endpoint(
      "undelete",
      UndeleteHandler(
        read = localManhattanOperations.read,
        localInsert = localManhattanOperations.insert,
        remoteInsert = remoteManhattanOperations.insert,
        delete = localManhattanOperations.delete,
        undeleteWindowHours = config.undeleteWindowHours,
        stats = statsReceiver
      )
    )

  val getDeletedTweets: TweetStorageClient.GetDeletedTweets =
    endpoint(
      "getDeletedTweets",
      GetDeletedTweetsHandler(
        read = localManhattanOperations.read,
        stats = statsReceiver
      )
    )

  val deleteAdditionalFields: TweetStorageClient.DeleteAdditionalFields =
    endpoint2(
      "deleteAdditionalFields",
      DeleteAdditionalFieldsHandler(
        delete = localManhattanOperations.delete,
        stats = statsReceiver,
      ),
      clientIdHelper,
    )

  val scrub: TweetStorageClient.Scrub =
    endpoint2(
      "scrub",
      ScrubHandler(
        insert = localManhattanOperations.insert,
        delete = localManhattanOperations.delete,
        scribe = scribe,
        stats = statsReceiver,
      ),
      clientIdHelper,
    )

  val hardDeleteTweet: HardDeleteTweet =
    endpoint(
      "hardDeleteTweet",
      HardDeleteTweetHandler(
        read = localManhattanOperations.read,
        insert = localManhattanOperations.insert,
        delete = localManhattanOperations.delete,
        scribe = scribe,
        stats = statsReceiver
      )
    )

  /**
   * Issues a read for the core-fields key of a random partition key against the local endpoint
   * and discards the result.
   */
  val ping: TweetStorageClient.Ping =
    () =>
      Stitch
        .run(
          localMhEndpoint
            .get(
              ManhattanOperations.KeyDescriptor
                .withDataset(Config.Dataset)
                // Clear the sign bit instead of calling `.abs`: `Long.MinValue.abs` is still
                // negative, so `.abs` could produce a negative partition key.
                .withPkey(Random.nextLong() & Long.MaxValue)
                .withLkey(TweetKey.LKey.CoreFieldsKey), // could be any lkey
              ValueDescriptor(BufInjection)
            ).unit
        )
}
|
|
@ -0,0 +1,30 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
/**
 * Response types for per-tweet storage operations.
 */
object Response {

  /**
   * Outcome of an operation on a single tweet.
   *
   * @param tweetId                  id of the tweet the response refers to
   * @param overallResponse          overall status for the tweet
   * @param additionalFieldResponses optional per-field responses keyed by field id
   */
  case class TweetResponse(
    tweetId: Long,
    overallResponse: TweetResponseCode,
    additionalFieldResponses: Option[Map[Short, FieldResponse]] = None)

  sealed trait TweetResponseCode

  // Members are case objects so that they print by name (e.g. "Success") in logs and
  // assertion failures instead of as an opaque object reference.
  object TweetResponseCode {
    case object Success extends TweetResponseCode
    case object Partial extends TweetResponseCode
    case object Failure extends TweetResponseCode
    case object OverCapacity extends TweetResponseCode
    case object Deleted extends TweetResponseCode
  }

  /**
   * Outcome for a single field.
   *
   * @param code    status of the field
   * @param message optional detail accompanying the status
   */
  case class FieldResponse(code: FieldResponseCode, message: Option[String] = None)

  sealed trait FieldResponseCode

  object FieldResponseCode {
    case object Success extends FieldResponseCode
    case object InvalidRequest extends FieldResponseCode
    case object ValueNotFound extends FieldResponseCode
    case object Timeout extends FieldResponseCode
    case object Error extends FieldResponseCode
  }
}
|
|
@ -0,0 +1,85 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.servo.util.FutureEffect
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.logging._
|
||||
import com.twitter.scrooge.BinaryThriftStructSerializer
|
||||
import com.twitter.servo.util.{Scribe => ServoScribe}
|
||||
import com.twitter.tweetypie.storage_internal.thriftscala._
|
||||
import com.twitter.tbird.thriftscala.Added
|
||||
import com.twitter.tbird.thriftscala.Removed
|
||||
import com.twitter.tbird.thriftscala.Scrubbed
|
||||
import com.twitter.util.Time
|
||||
|
||||
/**
 * Logs tweet writes to Scribe; these logs are used to generate /tables/statuses in HDFS.
 *
 * Write   Scribe Category      Message
 * -----   ---------------      -------
 * add     tbird_add_status     [[com.twitter.tbird.thriftscala.Added]]
 * remove  tbird_remove_status  [[com.twitter.tbird.thriftscala.Removed]]
 * scrub   tbird_scrub_status   [[com.twitter.tbird.thriftscala.Scrubbed]]
 *
 * Each thrift message is encoded with the binary thrift protocol, base64 encoded and converted
 * to a string using the default character set (utf8). The logger uses BareFormatter.
 *
 * The thrift ops are scribed only after the write API call has succeeded.
 *
 * The class is thread safe except for the initial configuration and registration routines,
 * and no exception is expected unless the java heap is out of memory.
 *
 * If an exception does get thrown, add/remove/scrub operations will fail and the
 * client will have to retry.
 *
 * @param factory       creates the logging handler factory for a scribe category
 * @param statsReceiver receiver on which the `scribe/.../count` counters are registered
 */
class Scribe(factory: Scribe.ScribeHandlerFactory, statsReceiver: StatsReceiver) {
  import Scribe._

  private val addedSerializer = BinaryThriftStructSerializer(Added)
  private val removedSerializer = BinaryThriftStructSerializer(Removed)
  private val scrubbedSerializer = BinaryThriftStructSerializer(Scrubbed)

  private val addCounter = statsReceiver.counter("scribe/add/count")
  private val removeCounter = statsReceiver.counter("scribe/remove/count")
  private val scrubCounter = statsReceiver.counter("scribe/scrub/count")

  val addHandler: FutureEffect[String] = ServoScribe(factory(scribeAddedCategory)())
  val removeHandler: FutureEffect[String] = ServoScribe(factory(scribeRemovedCategory)())
  val scrubHandler: FutureEffect[String] = ServoScribe(factory(scribeScrubbedCategory)())

  /** Scribes an `Added` message for the tweet, stamped with the current time. */
  def logAdded(tweet: StoredTweet): Unit = {
    val message =
      Added(StatusConversions.toTBirdStatus(tweet), Time.now.inMilliseconds, Some(false))
    addHandler(addedSerializer.toString(message))
    addCounter.incr()
  }

  /** Scribes a `Removed` message for the tweet id with the given removal time. */
  def logRemoved(id: Long, at: Time, isSoftDeleted: Boolean): Unit = {
    val message = Removed(id, at.inMilliseconds, Some(isSoftDeleted))
    removeHandler(removedSerializer.toString(message))
    removeCounter.incr()
  }

  /** Scribes a `Scrubbed` message listing the scrubbed columns of the tweet. */
  def logScrubbed(id: Long, cols: Seq[Int], at: Time): Unit = {
    val message = Scrubbed(id, cols, at.inMilliseconds)
    scrubHandler(scrubbedSerializer.toString(message))
    scrubCounter.incr()
  }
}
|
||||
|
||||
object Scribe {

  /** Given a scribe category name, produces the logging handler factory for it. */
  type ScribeHandlerFactory = String => HandlerFactory

  /**
   * WARNING: These categories are allowlisted. If you change them, the new categories must be
   * allowlisted as well; follow up with the CoreWorkflows team (CW) for that.
   */
  private val scribeAddedCategory: String = "tbird_add_status"
  private val scribeRemovedCategory: String = "tbird_remove_status"
  private val scribeScrubbedCategory: String = "tbird_scrub_status"
}
|
|
@ -0,0 +1,71 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.stitch.Stitch
|
||||
import com.twitter.storage.client.manhattan.kv.ManhattanValue
|
||||
import com.twitter.tweetypie.storage.TweetUtils._
|
||||
import com.twitter.util.Time
|
||||
|
||||
/**
 * Scrub support: removes the stored data of the scrubbed fields and then writes a metadata
 * record marking each field as scrubbed. Only the geo field may be scrubbed at the moment;
 * further fields can be enabled by adding them to the allowlist.
 */
object ScrubHandler {

  val scrubFieldsAllowlist: Set[Field] = Set(Field.Geo)

  /**
   * Builds the scrub operation.
   *
   * Non-positive tweet ids are silently dropped. The operation throws
   * `IllegalArgumentException` when `columns` is empty, contains duplicates, or contains a
   * field outside the allowlist. Scribe entries are written only after all tweets succeeded.
   */
  def apply(
    insert: ManhattanOperations.Insert,
    delete: ManhattanOperations.Delete,
    scribe: Scribe,
    stats: StatsReceiver
  ): TweetStorageClient.Scrub = { (unfilteredTweetIds: Seq[TweetId], columns: Seq[Field]) =>
    val tweetIds = unfilteredTweetIds.filter(id => id > 0)

    require(columns.nonEmpty, "Must specify fields to scrub")
    require(
      columns.distinct.size == columns.size,
      s"Duplicate fields to scrub specified: $columns"
    )
    require(
      !columns.exists(column => !scrubFieldsAllowlist.contains(column)),
      s"Cannot scrub $columns; scrubbable fields are restricted to $scrubFieldsAllowlist"
    )

    Stats.addWidthStat("scrub", "ids", tweetIds.size, stats)
    val scrubTime = Time.now

    // Deletes the stored value of every requested field of one tweet.
    def deleteFieldData(tweetId: TweetId) = {
      val deletions = columns.map { field =>
        delete(TweetKey.fieldKey(tweetId, field.id), Some(scrubTime)).liftToTry
      }
      Stitch.collect(deletions).map(collectWithRateLimitCheck).lowerFromTry
    }

    // Writes one empty-valued "scrubbed" marker record per requested field of one tweet.
    def writeScrubMarkers(tweetId: TweetId) = {
      val markerWrites = columns.map { column =>
        val marker = TweetManhattanRecord(
          TweetKey.scrubbedFieldKey(tweetId, column.id),
          ManhattanValue(StringCodec.toByteBuffer(""), Some(scrubTime))
        )
        insert(marker).liftToTry
      }
      Stitch.collect(markerWrites)
    }

    // Markers are only written once the deletions for that tweet have gone through.
    val perTweet = tweetIds.map { tweetId =>
      deleteFieldData(tweetId)
        .flatMap(_ => writeScrubMarkers(tweetId))
        .map(collectWithRateLimitCheck)
    }

    Stitch.collect(perTweet).map(collectWithRateLimitCheck).lowerFromTry.onSuccess { _ =>
      for (id <- tweetIds) {
        scribe.logScrubbed(id, columns.map(_.id.toInt), scrubTime)
      }
    }
  }
}
|
|
@ -0,0 +1,20 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.util.Time
|
||||
|
||||
/**
 * Soft deletion: writes a soft-deleted state record for the tweet and, once the write has
 * succeeded, scribes the removal.
 */
object SoftDeleteHandler {
  def apply(
    insert: ManhattanOperations.Insert,
    scribe: Scribe
  ): TweetStorageClient.SoftDelete = { tweetId =>
    // The same timestamp is used for the state record and for the scribe entry.
    val deletedAt = Time.now
    val stateRecord = TweetStateRecord.SoftDeleted(tweetId, deletedAt.inMillis)

    insert(stateRecord.toTweetMhRecord).onSuccess { _ =>
      scribe.logRemoved(tweetId, deletedAt, isSoftDeleted = true)
    }
  }
}
|
|
@ -0,0 +1,33 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
|
||||
object Stats {
  // Both methods below are called per RPC call for most APIs. Stats and counters are looked up
  // on every call, so we rely on the stats receiver that is passed in to the library to do
  // memoization.

  /**
   * Records `width` on the stat `<rpcName>/<paramName>/width`.
   */
  private[storage] def addWidthStat(
    rpcName: String,
    paramName: String,
    width: Int,
    stats: StatsReceiver
  ): Unit = {
    val widthStat = stats.scope(rpcName, paramName).stat("width")
    widthStat.add(width)
  }

  /**
   * Adds `count` to the counter `<rpcName>/fields/<fieldId>/count` of every given field. The
   * idea is to expose the QPS of each additional field.
   */
  private[storage] def updatePerFieldQpsCounters(
    rpcName: String,
    fieldIds: Seq[FieldId],
    count: Int,
    stats: StatsReceiver
  ): Unit =
    for (fieldId <- fieldIds) {
      val fieldCounter = stats.scope(rpcName, "fields", fieldId.toString).counter("count")
      fieldCounter.incr(count)
    }
}
|
|
@ -0,0 +1,129 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.tweetypie.storage_internal.thriftscala._
|
||||
import com.twitter.tbird.{thriftscala => tbird}
|
||||
|
||||
/**
 * Conversions between the stored tweet representation ([[StoredTweet]]) and the tbird
 * `Status` thrift struct.
 */
object StatusConversions {

  /**
   * This is used only in Scribe.scala, when scribing to tbird_add_status
   * Once we remove that, we can also remove this.
   *
   * The `userId`, `text`, `createdVia` and `createdAtSec` fields of the stored tweet must be
   * set; a `NoSuchElementException` is thrown otherwise. Missing boolean flags default to
   * `false` and missing media defaults to an empty sequence.
   */
  def toTBirdStatus(tweet: StoredTweet): tbird.Status =
    tbird.Status(
      id = tweet.id,
      userId = tweet.userId.get,
      text = tweet.text.get,
      createdVia = tweet.createdVia.get,
      createdAtSec = tweet.createdAtSec.get,
      reply = tweet.reply.map(toTBirdReply),
      share = tweet.share.map(toTBirdShare),
      contributorId = tweet.contributorId,
      geo = tweet.geo.map(toTBirdGeo),
      hasTakedown = tweet.hasTakedown.getOrElse(false),
      nsfwUser = tweet.nsfwUser.getOrElse(false),
      nsfwAdmin = tweet.nsfwAdmin.getOrElse(false),
      media = tweet.media.map(_.map(toTBirdMedia)).getOrElse(Seq()),
      narrowcast = tweet.narrowcast.map(toTBirdNarrowcast),
      nullcast = tweet.nullcast.getOrElse(false),
      trackingId = tweet.trackingId
    )

  /**
   * This is only used in a test, to verify that the above method `toTBirdStatus`
   * works, so we can't remove it as long as the above method exists.
   */
  def fromTBirdStatus(status: tbird.Status): StoredTweet = {
    StoredTweet(
      id = status.id,
      userId = Some(status.userId),
      text = Some(status.text),
      createdVia = Some(status.createdVia),
      createdAtSec = Some(status.createdAtSec),
      reply = status.reply.map(fromTBirdReply),
      share = status.share.map(fromTBirdShare),
      contributorId = status.contributorId,
      geo = status.geo.map(fromTBirdGeo),
      hasTakedown = Some(status.hasTakedown),
      nsfwUser = Some(status.nsfwUser),
      nsfwAdmin = Some(status.nsfwAdmin),
      media = Some(status.media.map(fromTBirdMedia)),
      narrowcast = status.narrowcast.map(fromTBirdNarrowcast),
      nullcast = Some(status.nullcast),
      trackingId = status.trackingId
    )
  }

  private def fromTBirdReply(reply: tbird.Reply): StoredReply =
    StoredReply(
      inReplyToStatusId = reply.inReplyToStatusId,
      inReplyToUserId = reply.inReplyToUserId
    )

  private def fromTBirdShare(share: tbird.Share): StoredShare =
    StoredShare(
      sourceStatusId = share.sourceStatusId,
      sourceUserId = share.sourceUserId,
      parentStatusId = share.parentStatusId
    )

  private def fromTBirdGeo(geo: tbird.Geo): StoredGeo =
    StoredGeo(
      latitude = geo.latitude,
      longitude = geo.longitude,
      geoPrecision = geo.geoPrecision,
      entityId = geo.entityId,
      // Mirror toTBirdGeo, which writes `name`; without this the round trip loses the name.
      name = geo.name
    )

  private def fromTBirdMedia(media: tbird.MediaEntity): StoredMediaEntity =
    StoredMediaEntity(
      id = media.id,
      mediaType = media.mediaType,
      width = media.width,
      height = media.height
    )

  private def fromTBirdNarrowcast(narrowcast: tbird.Narrowcast): StoredNarrowcast =
    StoredNarrowcast(
      language = Some(narrowcast.language),
      location = Some(narrowcast.location),
      ids = Some(narrowcast.ids)
    )

  private def toTBirdReply(reply: StoredReply): tbird.Reply =
    tbird.Reply(
      inReplyToStatusId = reply.inReplyToStatusId,
      inReplyToUserId = reply.inReplyToUserId
    )

  private def toTBirdShare(share: StoredShare): tbird.Share =
    tbird.Share(
      sourceStatusId = share.sourceStatusId,
      sourceUserId = share.sourceUserId,
      parentStatusId = share.parentStatusId
    )

  private def toTBirdGeo(geo: StoredGeo): tbird.Geo =
    tbird.Geo(
      latitude = geo.latitude,
      longitude = geo.longitude,
      geoPrecision = geo.geoPrecision,
      entityId = geo.entityId,
      name = geo.name
    )

  private def toTBirdMedia(media: StoredMediaEntity): tbird.MediaEntity =
    tbird.MediaEntity(
      id = media.id,
      mediaType = media.mediaType,
      width = media.width,
      height = media.height
    )

  // Unset narrowcast lists are written as empty lists.
  private def toTBirdNarrowcast(narrowcast: StoredNarrowcast): tbird.Narrowcast =
    tbird.Narrowcast(
      language = narrowcast.language.getOrElse(Nil),
      location = narrowcast.location.getOrElse(Nil),
      ids = narrowcast.ids.getOrElse(Nil)
    )
}
|
|
@ -0,0 +1,346 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.mediaservices.commons.tweetmedia.thriftscala._
|
||||
import com.twitter.scrooge.TFieldBlob
|
||||
import com.twitter.tweetypie.additionalfields.AdditionalFields
|
||||
import com.twitter.tweetypie.storage_internal.thriftscala._
|
||||
import com.twitter.tweetypie.thriftscala._
|
||||
import com.twitter.tweetypie.util.TweetLenses
|
||||
|
||||
object StorageConversions {
|
||||
// Ids of the fields declared on the StoredTweet thrift struct that are additional-field ids.
private val tbTweetCompiledAdditionalFieldIds =
  StoredTweet.metaData.fields.collect {
    case field if AdditionalFields.isAdditionalFieldId(field.id) => field.id
  }
|
||||
|
||||
/**
 * Converts a reply to its stored form. A missing in-reply-to status id is stored as 0.
 */
def toStoredReply(reply: Reply, conversationId: Option[TweetId]): StoredReply = {
  val statusId = reply.inReplyToStatusId.getOrElse(0L)

  StoredReply(
    inReplyToStatusId = statusId,
    inReplyToUserId = reply.inReplyToUserId,
    conversationId = conversationId
  )
}
|
||||
|
||||
/** Copies the source status id, source user id and parent status id into a stored share. */
def toStoredShare(share: Share): StoredShare =
  StoredShare(
    sourceStatusId = share.sourceStatusId,
    sourceUserId = share.sourceUserId,
    parentStatusId = share.parentStatusId
  )
|
||||
|
||||
/**
 * Converts a quoted tweet reference to its stored form.
 *
 * Returns `None` when the quoted tweet has no permalink, or when the permalink's short url
 * already appears in `text` (the StoredQuotedTweet is omitted in that case).
 */
def toStoredQuotedTweet(qt: QuotedTweet, text: String): Option[StoredQuotedTweet] =
  for {
    permalink <- qt.permalink
    if !text.contains(permalink.shortUrl)
  } yield StoredQuotedTweet(
    qt.tweetId,
    qt.userId,
    permalink.shortUrl
  )
|
||||
|
||||
/**
 * Builds the stored geo record of a tweet.
 *
 *  - With coordinates: they are stored, together with the place id (if any) as `name`;
 *    `entityId` is 2 when the coordinates are flagged for display and 0 otherwise.
 *  - With only a place id: a zeroed record carrying the place id as `name` is stored.
 *  - With neither: `None`.
 */
def toStoredGeo(tweet: Tweet): Option[StoredGeo] = {
  val coordinates = TweetLenses.geoCoordinates.get(tweet)
  val place = TweetLenses.placeId.get(tweet)

  (coordinates, place) match {
    case (Some(coords), _) =>
      Some(
        StoredGeo(
          latitude = coords.latitude,
          longitude = coords.longitude,
          geoPrecision = coords.geoPrecision,
          entityId = if (coords.display) 2 else 0,
          name = place
        )
      )
    case (None, Some(placeId)) =>
      Some(
        StoredGeo(
          latitude = 0.0,
          longitude = 0.0,
          geoPrecision = 0,
          entityId = 0,
          name = Some(placeId)
        )
      )
    case (None, None) =>
      None
  }
}
|
||||
|
||||
/**
 * Converts media entities to their stored form. Entities that carry a source status id are
 * skipped, as are entities that `toStoredMediaEntity` cannot convert.
 */
def toStoredMedia(mediaList: Seq[MediaEntity]): Seq[StoredMediaEntity] = {
  val withoutSourceStatus = mediaList.filter(entity => entity.sourceStatusId.isEmpty)
  withoutSourceStatus.flatMap(entity => toStoredMediaEntity(entity))
}
|
||||
|
||||
/**
 * Converts a media entity to its stored form using the entity's `Orig` size. Returns `None`
 * when the entity has no size of that type.
 */
def toStoredMediaEntity(media: MediaEntity): Option[StoredMediaEntity] =
  media.sizes.collectFirst {
    case size if size.sizeType == MediaSizeType.Orig =>
      StoredMediaEntity(
        id = media.mediaId,
        mediaType = size.deprecatedContentType.value.toByte,
        width = size.width.toShort,
        height = size.height.toShort
      )
  }
|
||||
|
||||
/**
 * Converts a narrowcast to its stored form. Only `location` carries data; `language` and
 * `ids` are written as empty lists for compatibility with existing tweets stored in manhattan.
 */
def toStoredNarrowcast(narrowcast: Narrowcast): StoredNarrowcast =
  StoredNarrowcast(
    language = Some(Nil),
    location = Some(narrowcast.location),
    ids = Some(Nil)
  )
|
||||
|
||||
/** Sets every given field blob on the stored tweet, in order, and returns the result. */
def toStoredAdditionalFields(from: Seq[TFieldBlob], to: StoredTweet): StoredTweet =
  from.foldLeft(to)((stored, blob) => stored.setField(blob))

/** Copies the additional fields of `from` onto the stored tweet `to`. */
def toStoredAdditionalFields(from: Tweet, to: StoredTweet): StoredTweet = {
  val blobs = AdditionalFields.additionalFields(from)
  toStoredAdditionalFields(blobs, to)
}
|
||||
|
||||
/**
 * Converts a Tweetypie tweet to its on-disk representation: the compiled fields are read
 * through [[TweetLenses]] and the tweet's additional fields are copied on top.
 */
def toStoredTweet(tweet: Tweet): StoredTweet = {
  val compiledFields =
    StoredTweet(
      id = tweet.id,
      userId = Some(TweetLenses.userId(tweet)),
      text = Some(TweetLenses.text(tweet)),
      createdVia = Some(TweetLenses.createdVia(tweet)),
      createdAtSec = Some(TweetLenses.createdAt(tweet)),
      reply =
        for (reply <- TweetLenses.reply(tweet))
          yield toStoredReply(reply, TweetLenses.conversationId(tweet)),
      share = TweetLenses.share(tweet).map(share => toStoredShare(share)),
      contributorId = tweet.contributor.map(contributor => contributor.userId),
      geo = toStoredGeo(tweet),
      hasTakedown = Some(TweetLenses.hasTakedown(tweet)),
      nsfwUser = Some(TweetLenses.nsfwUser(tweet)),
      nsfwAdmin = Some(TweetLenses.nsfwAdmin(tweet)),
      media = tweet.media.map(entities => toStoredMedia(entities)),
      narrowcast = TweetLenses.narrowcast(tweet).map(nc => toStoredNarrowcast(nc)),
      nullcast = Some(TweetLenses.nullcast(tweet)),
      trackingId = TweetLenses.trackingId(tweet),
      quotedTweet =
        for {
          quoted <- TweetLenses.quotedTweet(tweet)
          stored <- toStoredQuotedTweet(quoted, TweetLenses.text(tweet))
        } yield stored
    )

  toStoredAdditionalFields(tweet, compiledFields)
}
|
||||
|
||||
/**
 * Constructs an on disk tweet that carries only the specified fields, reading them directly
 * from the tweet instead of going through the TweetLenses object.
 *
 * Core data is only read when one of the compiled fields (geo, hasTakedown, nsfwUser,
 * nsfwAdmin) is requested; a request for additional fields only does not need it to be set.
 *
 * NOTE: Assumes that specified fields are set in the tweet. If a compiled field is requested
 * and the tweet has no core data, a `NoSuchElementException` is thrown.
 *
 * @param tpTweet Tweetypie Tweet to be converted
 * @param fields the fields to be populated in the on disk Tweet
 *
 * @return an on disk Tweet which has only the specified fields set
 * @throws IllegalArgumentException if a field is neither an updatable compiled field nor an
 *                                  additional field
 */
def toStoredTweetForFields(tpTweet: Tweet, fields: Set[Field]): StoredTweet = {

  // Make sure all the passed in fields are known or additional fields
  val unsupportedFields = (fields -- Field.AllUpdatableCompiledFields)
    .filterNot(field => AdditionalFields.isAdditionalFieldId(field.id))
  require(
    unsupportedFields.isEmpty,
    s"Fields are neither updatable compiled fields nor additional fields: $unsupportedFields"
  )

  // A def rather than a val: core data must only be dereferenced when a compiled field is
  // actually requested.
  def coreData = tpTweet.coreData.get

  // Coordinates win over the place id; with only a place id a zeroed record is stored.
  def storedGeo: Option[StoredGeo] =
    coreData.coordinates match {
      case Some(coords) =>
        Some(
          StoredGeo(
            latitude = coords.latitude,
            longitude = coords.longitude,
            geoPrecision = coords.geoPrecision,
            entityId = if (coords.display) 2 else 0,
            name = coreData.placeId
          )
        )
      case None =>
        coreData.placeId.map { placeId =>
          StoredGeo(
            latitude = 0.0,
            longitude = 0.0,
            geoPrecision = 0,
            entityId = 0,
            name = Some(placeId)
          )
        }
    }

  val storedTweet =
    StoredTweet(
      id = tpTweet.id,
      geo = if (fields.contains(Field.Geo)) storedGeo else None,
      hasTakedown =
        if (fields.contains(Field.HasTakedown)) Some(coreData.hasTakedown) else None,
      nsfwUser =
        if (fields.contains(Field.NsfwUser)) Some(coreData.nsfwUser) else None,
      nsfwAdmin =
        if (fields.contains(Field.NsfwAdmin)) Some(coreData.nsfwAdmin) else None
    )

  // Additional fields are copied from the tweet as soon as any additional field is requested.
  if (fields.exists(field => AdditionalFields.isAdditionalFieldId(field.id)))
    toStoredAdditionalFields(tpTweet, storedTweet)
  else
    storedTweet
}
|
||||
|
||||
/**
 * Converts a stored reply back to a reply. A non-positive in-reply-to status id is read back
 * as `None`.
 */
def fromStoredReply(reply: StoredReply): Reply = {
  val statusId =
    if (reply.inReplyToStatusId > 0) Some(reply.inReplyToStatusId) else None

  Reply(
    statusId,
    reply.inReplyToUserId
  )
}
|
||||
|
||||
/** Copies the source status id, source user id and parent status id into a share. */
def fromStoredShare(share: StoredShare): Share =
  Share(
    sourceStatusId = share.sourceStatusId,
    sourceUserId = share.sourceUserId,
    parentStatusId = share.parentStatusId
  )
|
||||
|
||||
/**
 * Converts a stored quoted tweet back to a quoted tweet. Only the short url of the permalink
 * is known at this point; the long url and display text are left empty.
 */
def fromStoredQuotedTweet(qt: StoredQuotedTweet): QuotedTweet = {
  val permalink =
    ShortenedUrl(
      shortUrl = qt.shortUrl,
      longUrl = "", // will be hydrated later via tweetypie's QuotedTweetRefUrlsHydrator
      displayText = "" // will be hydrated later via tweetypie's QuotedTweetRefUrlsHydrator
    )

  QuotedTweet(
    qt.tweetId,
    qt.userId,
    Some(permalink)
  )
}
|
||||
|
||||
/**
 * Converts a stored geo record to coordinates. The coordinates are flagged for display
 * exactly when the stored `entityId` is 2.
 */
def fromStoredGeo(geo: StoredGeo): GeoCoordinates = {
  val shouldDisplay = geo.entityId == 2

  GeoCoordinates(
    latitude = geo.latitude,
    longitude = geo.longitude,
    geoPrecision = geo.geoPrecision,
    display = shouldDisplay
  )
}
|
||||
|
||||
/**
 * Converts a stored media entity back to a media entity. Only the media id and a single
 * `Orig` size are populated from storage; indices and urls are placeholders.
 */
def fromStoredMediaEntity(media: StoredMediaEntity): MediaEntity = {
  val origSize =
    MediaSize(
      sizeType = MediaSizeType.Orig,
      resizeMethod = MediaResizeMethod.Fit,
      deprecatedContentType = MediaContentType(media.mediaType),
      width = media.width,
      height = media.height
    )

  MediaEntity(
    fromIndex = -1, // will get filled in later
    toIndex = -1, // will get filled in later
    url = null, // will get filled in later
    mediaPath = "", // field is obsolete
    mediaUrl = null, // will get filled in later
    mediaUrlHttps = null, // will get filled in later
    displayUrl = null, // will get filled in later
    expandedUrl = null, // will get filled in later
    mediaId = media.id,
    nsfw = false,
    sizes = Set(origSize)
  )
}
|
||||
|
||||
/** Builds a Narrowcast from its stored form; a missing location list becomes an empty one. */
def fromStoredNarrowcast(narrowcast: StoredNarrowcast): Narrowcast = {
  val locations = narrowcast.location.getOrElse(Seq())
  Narrowcast(location = locations)
}
|
||||
|
||||
/**
 * Converts a StoredTweet into a Tweet.
 *
 * userId, text, createdVia and createdAtSec are read with `.get`, so this throws if any of them
 * is absent. Mentions, urls, cashtags and hashtags are always set to empty sequences here, and
 * additional fields are copied over by fromStoredAdditionalFields.
 */
def fromStoredTweet(storedTweet: StoredTweet): Tweet = {
  val storedReply = storedTweet.reply
  val storedGeo = storedTweet.geo

  // Some(true) when the stored tweet carries a non-empty media list, None otherwise.
  val hasMedia = storedTweet.media.filter(_.nonEmpty).map(_ => true)

  val coreData =
    TweetCoreData(
      userId = storedTweet.userId.get,
      text = storedTweet.text.get,
      createdVia = storedTweet.createdVia.get,
      createdAtSecs = storedTweet.createdAtSec.get,
      reply = storedReply.map(fromStoredReply),
      share = storedTweet.share.map(fromStoredShare),
      hasTakedown = storedTweet.hasTakedown.getOrElse(false),
      nsfwUser = storedTweet.nsfwUser.getOrElse(false),
      nsfwAdmin = storedTweet.nsfwAdmin.getOrElse(false),
      narrowcast = storedTweet.narrowcast.map(fromStoredNarrowcast),
      nullcast = storedTweet.nullcast.getOrElse(false),
      trackingId = storedTweet.trackingId,
      conversationId = storedReply.flatMap(_.conversationId),
      placeId = storedGeo.flatMap(_.name),
      coordinates = storedGeo.map(fromStoredGeo),
      hasMedia = hasMedia
    )

  // retweets should never have their media, but some tweets incorrectly do.
  val isRetweet = coreData.share.isDefined
  val mediaEntities =
    (if (isRetweet) Nil else storedTweet.media.toSeq).flatten.map(fromStoredMediaEntity)

  val noEntities = Some(Seq.empty)
  val tpTweet =
    Tweet(
      id = storedTweet.id,
      coreData = Some(coreData),
      contributor = storedTweet.contributorId.map(Contributor(_)),
      media = Some(mediaEntities),
      mentions = noEntities,
      urls = noEntities,
      cashtags = noEntities,
      hashtags = noEntities,
      quotedTweet = storedTweet.quotedTweet.map(fromStoredQuotedTweet)
    )

  fromStoredAdditionalFields(storedTweet, tpTweet)
}
|
||||
|
||||
/**
 * Like fromStoredTweet, but first substitutes placeholders for the fields that fromStoredTweet
 * reads with `.get`: -1 for a missing user id or creation time, "" for missing text or
 * created-via.
 */
def fromStoredTweetAllowInvalid(storedTweet: StoredTweet): Tweet = {
  val withPlaceholders =
    storedTweet.copy(
      userId = storedTweet.userId.orElse(Some(-1L)),
      text = storedTweet.text.orElse(Some("")),
      createdVia = storedTweet.createdVia.orElse(Some("")),
      createdAtSec = storedTweet.createdAtSec.orElse(Some(-1L))
    )
  fromStoredTweet(withPlaceholders)
}
|
||||
|
||||
/**
 * Copies additional fields from a StoredTweet onto a Tweet. Sources are the field blobs for
 * tbTweetCompiledAdditionalFieldIds plus any passthrough fields whose id is an additional field
 * id; a passthrough entry wins when both maps contain the same key.
 */
def fromStoredAdditionalFields(from: StoredTweet, to: Tweet): Tweet = {
  val passthroughBlobs = from._passthroughFields.filterKeys(AdditionalFields.isAdditionalFieldId)
  val compiledBlobs = from.getFieldBlobs(tbTweetCompiledAdditionalFieldIds)
  (compiledBlobs ++ passthroughBlobs).values.foldLeft(to) { (tweet, blob) =>
    tweet.setField(blob)
  }
}
|
||||
|
||||
/**
 * Builds the DeletedTweet view of a StoredTweet. The note tweet id and expandable flag are
 * decoded from the NoteTweet field blob when that field is present.
 */
def toDeletedTweet(storedTweet: StoredTweet): DeletedTweet = {
  val noteTweet =
    storedTweet
      .getFieldBlob(Tweet.NoteTweetField.id)
      .map(blob => NoteTweet.decode(blob.read))

  DeletedTweet(
    id = storedTweet.id,
    userId = storedTweet.userId,
    text = storedTweet.text,
    createdAtSecs = storedTweet.createdAtSec,
    share = storedTweet.share.map(toDeletedShare),
    media = storedTweet.media.map(entities => entities.map(toDeletedMediaEntity)),
    noteTweetId = noteTweet.map(_.id),
    isExpandable = noteTweet.flatMap(_.isExpandable)
  )
}
|
||||
|
||||
/** Copies the three ids of a stored share into a DeletedTweetShare. */
def toDeletedShare(storedShare: StoredShare): DeletedTweetShare = {
  val sourceStatusId = storedShare.sourceStatusId
  val sourceUserId = storedShare.sourceUserId
  val parentStatusId = storedShare.parentStatusId
  DeletedTweetShare(
    sourceStatusId = sourceStatusId,
    sourceUserId = sourceUserId,
    parentStatusId = parentStatusId
  )
}
|
||||
|
||||
/** Copies id, media type, width and height of a stored media entity into a DeletedTweetMediaEntity. */
def toDeletedMediaEntity(storedMediaEntity: StoredMediaEntity): DeletedTweetMediaEntity = {
  val stored = storedMediaEntity
  DeletedTweetMediaEntity(
    id = stored.id,
    mediaType = stored.mediaType,
    width = stored.width,
    height = stored.height
  )
}
|
||||
}
|
|
@ -0,0 +1,92 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.util.Return
|
||||
import com.twitter.util.Throw
|
||||
import com.twitter.util.Time
|
||||
import com.twitter.util.Try
|
||||
import java.util.Arrays
|
||||
import scala.util.control.NoStackTrace
|
||||
import scala.util.control.NonFatal
|
||||
|
||||
/**
 * Selects which timestamp to read from a state record's JSON value.
 *
 * @param keyName the JSON key under which the timestamp is stored
 */
sealed abstract class TimestampType(val keyName: String)

object TimestampType {

  /** The timestamp stored under the "timestamp" key. */
  object Default extends TimestampType(keyName = "timestamp")

  /** The timestamp stored under the "softdelete_timestamp" key. */
  object SoftDelete extends TimestampType(keyName = "softdelete_timestamp")
}
|
||||
|
||||
/**
 * TimestampDecoder extracts the timestamp associated with a state record.
 *
 * Legacy records (whose value is exactly "1") carry no timestamp in their value, so the
 * Manhattan timestamp is used for them. For every other record the timestamp is read from the
 * JSON value, under the key named by the requested [[TimestampType]].
 *
 * See "Metadata" in README.md for further information about state records.
 */
object TimestampDecoder {
  case class UnparsableJson(msg: String, t: Throwable) extends Exception(msg, t) with NoStackTrace
  case class MissingJsonTimestamp(msg: String) extends Exception(msg) with NoStackTrace
  case class UnexpectedJsonValue(msg: String) extends Exception(msg) with NoStackTrace
  case class MissingManhattanTimestamp(msg: String) extends Exception(msg) with NoStackTrace

  /** Value bytes that identify a legacy record. */
  private[storage] val LegacyValue: Array[Byte] = Array('1')

  /**
   * The first backfill of tweet data to Manhattan supplied timestamps in milliseconds where
   * nanoseconds were expected, so some values have an incorrect Manhattan timestamp. For those
   * bad timestamps, time.inNanoseconds is actually a millisecond count.
   *
   * For example, the deletion record for tweet 22225781 has Manhattan timestamp
   * 1970-01-01 00:23:24 +0000, whereas the deletion record for tweet 435404491999813632 has
   * Manhattan timestamp 2014-11-09 14:24:04 +0000.
   *
   * The threshold is the last time in milliseconds that was interpreted as nanoseconds,
   * e.g. Time.fromNanoseconds(1438387200000L) == 1970-01-01 00:23:58 +0000
   */
  private[storage] val BadTimestampThreshold = Time.at("1970-01-01 00:23:58 +0000")

  /** Decodes the timestamp of the given record's value. */
  def decode(record: TweetManhattanRecord, tsType: TimestampType): Try[Long] =
    decode(record.value, tsType)

  /**
   * Decodes a timestamp from a Manhattan value: the (corrected) Manhattan timestamp for legacy
   * records, the JSON entry selected by `tsType` otherwise.
   */
  def decode(mhValue: TweetManhattanValue, tsType: TimestampType): Try[Long] = {
    val bytes = ByteArrayCodec.fromByteBuffer(mhValue.contents)
    if (isLegacyRecord(bytes)) nativeManhattanTimestamp(mhValue)
    else jsonTimestamp(bytes, tsType)
  }

  private def isLegacyRecord(value: Array[Byte]) = Arrays.equals(value, LegacyValue)

  /** Corrected Manhattan timestamp of the value, or a failure when the value has none. */
  private def nativeManhattanTimestamp(mhValue: TweetManhattanValue): Try[Long] =
    mhValue.timestamp
      .map(ts => Return(correctedTimestamp(ts)))
      .getOrElse(
        Throw(MissingManhattanTimestamp(s"Manhattan timestamp missing in value $mhValue"))
      )

  /** Parses the value as JSON and reads the Long/Integer stored under `tsType.keyName`. */
  private def jsonTimestamp(value: Array[Byte], tsType: TimestampType): Try[Long] = {
    val parsed =
      Try { Json.decode(value) }
        .rescue { case NonFatal(e) => Throw(UnparsableJson(e.getMessage, e)) }

    parsed.flatMap { m =>
      m.get(tsType.keyName) match {
        case Some(l: Long) => Return(l)
        case Some(i: Integer) => Return(i.toLong)
        case Some(_) =>
          Throw(UnexpectedJsonValue(s"Unexpected value for ${tsType.keyName} in record data $m"))
        case None =>
          Throw(MissingJsonTimestamp(s"Missing key ${tsType.keyName} in record data $m"))
      }
    }
  }

  /** Re-reads a time below the bad-timestamp threshold as milliseconds; other times pass through. */
  def correctedTime(t: Time): Time = {
    val wasWrittenAsMillis = t < BadTimestampThreshold
    if (wasWrittenAsMillis) Time.fromMilliseconds(t.inNanoseconds) else t
  }

  /** Same correction, starting from a raw nanosecond count. */
  def correctedTime(t: Long): Time = correctedTime(Time.fromNanoseconds(t))

  /**
   * Millisecond timestamp for the given time. Below the bad-timestamp threshold the nanosecond
   * count is returned as-is, because it already holds milliseconds.
   */
  def correctedTimestamp(t: Time): Long = {
    val wasWrittenAsMillis = t < BadTimestampThreshold
    if (wasWrittenAsMillis) t.inNanoseconds else t.inMilliseconds
  }
}
|
|
@ -0,0 +1,164 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
/**
 * Responsible for encoding/decoding Tweet records to/from Manhattan keys.
 *
 * K/V Scheme:
 * -----------
 * [TweetId]
 *   /metadata
 *     /delete_state (a.k.a. hard delete)
 *     /soft_delete_state
 *     /bounce_delete_state
 *     /undelete_state
 *     /force_added_state
 *     /scrubbed_fields/
 *       /[ScrubbedFieldId_1]
 *       ..
 *       /[ScrubbedFieldId_M]
 *   /fields
 *     /internal
 *       /1
 *       /9
 *       ..
 *       /99
 *     /external
 *       /100
 *       ..
 *
 * IMPORTANT NOTE:
 * 1) Field ids 2 to 8 in the Tweet thrift struct are considered "core fields". They are packed
 *    together into a TFieldBlob and stored under field id 1
 *    (i.e. [DatasetName]/[TweetId]/fields/internal/1). This is why there are no keys from
 *    [DatasetName]/[TweetId]/fields/internal/2 to [DatasetName]/[TweetId]/fields/internal/8.
 *
 * 2) The tweet id (field id 1 in the Tweet thrift struct) is not explicitly stored in
 *    Manhattan. There is no need to store it since it is part of the pkey.
 */
case class TweetKey(tweetId: TweetId, lKey: TweetKey.LKey) {

  /** Renders the key as "/<encoded pkey>/<encoded lkey>" using the ManhattanOperations injections. */
  override def toString: String = {
    val encodedPkey = ManhattanOperations.PkeyInjection(tweetId)
    val encodedLkey = ManhattanOperations.LkeyInjection(lKey)
    s"/$encodedPkey/$encodedLkey"
  }
}
|
||||
|
||||
object TweetKey {
  // Manhattan uses lexicographical order for keys. To make sure lexicographical order matches the
  // numerical order, both tweet ids and field ids are padded with leading zeros.
  // Since a tweet id is a Long and a field id is a Short, the max width of each is
  // Long.MaxValue.toString.length and Short.MaxValue.toString.length respectively.
  private val TweetIdFormatStr = s"%0${Long.MaxValue.toString.length}d"
  private val FieldIdFormatStr = s"%0${Short.MaxValue.toString.length}d"

  /** Zero-pads a tweet id to the width of Long.MaxValue. */
  private[storage] def padTweetIdStr(tweetId: Long): String = TweetIdFormatStr.format(tweetId)

  /** Zero-pads a field id to the width of Short.MaxValue. */
  private[storage] def padFieldIdStr(fieldId: Short): String = FieldIdFormatStr.format(fieldId)

  def coreFieldsKey(tweetId: TweetId): TweetKey = TweetKey(tweetId, LKey.CoreFieldsKey)
  def hardDeletionStateKey(tweetId: TweetId): TweetKey =
    TweetKey(tweetId, LKey.HardDeletionStateKey)
  def softDeletionStateKey(tweetId: TweetId): TweetKey =
    TweetKey(tweetId, LKey.SoftDeletionStateKey)
  def bounceDeletionStateKey(tweetId: TweetId): TweetKey =
    TweetKey(tweetId, LKey.BounceDeletionStateKey)
  def unDeletionStateKey(tweetId: TweetId): TweetKey = TweetKey(tweetId, LKey.UnDeletionStateKey)
  def forceAddedStateKey(tweetId: TweetId): TweetKey = TweetKey(tweetId, LKey.ForceAddedStateKey)
  def scrubbedGeoFieldKey(tweetId: TweetId): TweetKey = TweetKey(tweetId, LKey.ScrubbedGeoFieldKey)

  /** Key for a field id; internal or additional is chosen by [[LKey.FieldKey.apply]]. */
  def fieldKey(tweetId: TweetId, fieldId: FieldId): TweetKey =
    TweetKey(tweetId, LKey.FieldKey(fieldId))
  def internalFieldsKey(tweetId: TweetId, fieldId: FieldId): TweetKey =
    TweetKey(tweetId, LKey.InternalFieldsKey(fieldId))
  def additionalFieldsKey(tweetId: TweetId, fieldId: FieldId): TweetKey =
    TweetKey(tweetId, LKey.AdditionalFieldsKey(fieldId))
  def scrubbedFieldKey(tweetId: TweetId, fieldId: FieldId): TweetKey =
    TweetKey(tweetId, LKey.ScrubbedFieldKey(fieldId))

  // AllFieldsKeyPrefix:       fields
  // CoreFieldsKey:            fields/internal/1  (Stores subset of StoredTweet fields which are
  //                             "packed" into a single CoreFields record)
  // HardDeletionStateKey:     metadata/delete_state
  // SoftDeletionStateKey:     metadata/soft_delete_state
  // BounceDeletionStateKey:   metadata/bounce_delete_state
  // UnDeletionStateKey:       metadata/undelete_state
  // ForceAddedStateKey:       metadata/force_added_state
  // FieldKey:                 fields/<group_name>/<padded_field_id> (where <group_name>
  //                             is 'internal' for field ids < 100 and 'external' for all other
  //                             fields ids)
  // InternalFieldsKeyPrefix:  fields/internal
  // PKey:                     <empty string>
  // ScrubbedFieldKey:         metadata/scrubbed_fields/<padded_field_id>
  // ScrubbedFieldKeyPrefix:   metadata/scrubbed_fields
  sealed abstract class LKey(override val toString: String)
  object LKey {
    private val HardDeletionRecordLiteral = "delete_state"
    private val SoftDeletionRecordLiteral = "soft_delete_state"
    private val BounceDeletionRecordLiteral = "bounce_delete_state"
    private val UnDeletionRecordLiteral = "undelete_state"
    private val ForceAddRecordLiteral = "force_added_state"
    private val ScrubbedFieldsGroup = "scrubbed_fields"
    private val InternalFieldsGroup = "internal"
    private val ExternalFieldsGroup = "external"
    private val MetadataCategory = "metadata"
    private val FieldsCategory = "fields"
    private val InternalFieldsKeyPrefix = s"$FieldsCategory/$InternalFieldsGroup/"
    private val ExternalFieldsKeyPrefix = s"$FieldsCategory/$ExternalFieldsGroup/"
    private val ScrubbedFieldsKeyPrefix = s"$MetadataCategory/$ScrubbedFieldsGroup/"

    /** Any key under "metadata/". */
    sealed abstract class MetadataKey(metadataType: String)
        extends LKey(s"$MetadataCategory/$metadataType")

    /** Metadata key of a tweet state record. */
    sealed abstract class StateKey(stateType: String) extends MetadataKey(stateType)
    case object HardDeletionStateKey extends StateKey(s"$HardDeletionRecordLiteral")
    case object SoftDeletionStateKey extends StateKey(s"$SoftDeletionRecordLiteral")
    case object BounceDeletionStateKey extends StateKey(s"$BounceDeletionRecordLiteral")
    case object UnDeletionStateKey extends StateKey(s"$UnDeletionRecordLiteral")
    case object ForceAddedStateKey extends StateKey(s"$ForceAddRecordLiteral")

    case class ScrubbedFieldKey(fieldId: FieldId)
        extends MetadataKey(s"$ScrubbedFieldsGroup/${padFieldIdStr(fieldId)}")
    val ScrubbedGeoFieldKey: LKey.ScrubbedFieldKey = ScrubbedFieldKey(TweetFields.geoFieldId)

    /**
     * LKey that has one of many possible field ids. This generalizes over
     * internal and additional fields keys.
     */
    // NOTE(review): the key string callers observe comes from the overriding `toString` val
    // below (prefix + padded field id); the argument handed to the LKey constructor looks like
    // a placeholder only — confirm before touching the initialization here.
    sealed abstract class FieldKey(prefix: String) extends LKey(toString) {
      def fieldId: FieldId
      override val toString: String = prefix + padFieldIdStr(fieldId)
    }
    object FieldKey {

      /** Internal key for ids below TweetFields.firstAdditionalFieldId, additional key otherwise. */
      def apply(fieldId: FieldId): FieldKey =
        fieldId match {
          case id if id < TweetFields.firstAdditionalFieldId => InternalFieldsKey(fieldId)
          case _ => AdditionalFieldsKey(fieldId)
        }
    }

    case class InternalFieldsKey(fieldId: FieldId) extends FieldKey(InternalFieldsKeyPrefix) {
      assert(fieldId < TweetFields.firstAdditionalFieldId)
    }
    case class AdditionalFieldsKey(fieldId: FieldId) extends FieldKey(ExternalFieldsKeyPrefix) {
      assert(fieldId >= TweetFields.firstAdditionalFieldId)
    }
    val CoreFieldsKey: LKey.InternalFieldsKey = InternalFieldsKey(TweetFields.rootCoreFieldId)

    /** An lkey string that does not correspond to any known key. */
    case class Unknown private (str: String) extends LKey(str)

    /**
     * Parses an lkey string back into an [[LKey]].
     *
     * Exact matches for the fixed keys are tried first, then the three field-id prefixes.
     * A string that matches a prefix but whose remainder is not a valid Short, and any string
     * that matches nothing, is returned as [[Unknown]] rather than throwing. A numeric id that
     * is out of range for its prefix still trips the assertion in the key's constructor.
     */
    def fromString(str: String): LKey = {
      // Field id following `prefix`, or None when the remainder is not a parsable Short.
      def extractFieldId(prefix: String): Option[FieldId] =
        try Some(str.slice(prefix.length, str.length).toShort)
        catch { case _: NumberFormatException => None }

      str match {
        case CoreFieldsKey.toString => CoreFieldsKey
        case HardDeletionStateKey.toString => HardDeletionStateKey
        case SoftDeletionStateKey.toString => SoftDeletionStateKey
        case BounceDeletionStateKey.toString => BounceDeletionStateKey
        case UnDeletionStateKey.toString => UnDeletionStateKey
        case ForceAddedStateKey.toString => ForceAddedStateKey
        case ScrubbedGeoFieldKey.toString => ScrubbedGeoFieldKey
        case _ if str.startsWith(InternalFieldsKeyPrefix) =>
          extractFieldId(InternalFieldsKeyPrefix)
            .fold[LKey](Unknown(str))(id => InternalFieldsKey(id))
        case _ if str.startsWith(ExternalFieldsKeyPrefix) =>
          extractFieldId(ExternalFieldsKeyPrefix)
            .fold[LKey](Unknown(str))(id => AdditionalFieldsKey(id))
        case _ if str.startsWith(ScrubbedFieldsKeyPrefix) =>
          extractFieldId(ScrubbedFieldsKeyPrefix)
            .fold[LKey](Unknown(str))(id => ScrubbedFieldKey(id))
        case _ => Unknown(str)
      }
    }
  }
}
|
|
@ -0,0 +1,90 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.storage.client.manhattan.kv.ManhattanValue
|
||||
import com.twitter.util.Time
|
||||
|
||||
/**
 * A [[TweetStateRecord]] represents an action taken on a tweet and can be used to determine a
 * tweet's state.
 *
 * The state is determined by the record with the most recent timestamp. In the absence of any
 * record a tweet is considered found, which is to say the tweet has not been through the
 * deletion process.
 *
 * The [[TweetStateRecord]] type is determined by the lkey of a tweet manhattan record:
 *   metadata/delete_state        -> HardDeleted
 *   metadata/soft_delete_state   -> SoftDeleted
 *   metadata/bounce_delete_state -> BounceDeleted
 *   metadata/undelete_state      -> Undeleted
 *   metadata/force_added_state   -> ForceAdded
 *
 * See the README in this directory for more details about the state of a tweet.
 */
sealed trait TweetStateRecord {
  def tweetId: TweetId

  /** Timestamp of the record; written to Manhattan as milliseconds by toTweetMhRecord. */
  def createdAt: Long

  def stateKey: TweetKey.LKey.StateKey

  /** Entries that are JSON-encoded into the record's value. */
  def values: Map[String, Long] = Map("timestamp" -> createdAt)

  def name: String

  /**
   * Renders this state as a Manhattan record: the key is (tweetId, stateKey), the value is the
   * JSON encoding of `values`, stamped with `createdAt`.
   */
  def toTweetMhRecord: TweetManhattanRecord = {
    val encodedValues = ByteArrayCodec.toByteBuffer(Json.encode(values))
    val stampedAt = Some(Time.fromMilliseconds(createdAt))
    TweetManhattanRecord(TweetKey(tweetId, stateKey), ManhattanValue(encodedValues, stampedAt))
  }
}
|
||||
|
||||
object TweetStateRecord {

  /** When a soft-deleted or bounce deleted tweet is ultimately hard-deleted by an offline job. */
  case class HardDeleted(tweetId: TweetId, createdAt: Long, deletedAt: Long)
      extends TweetStateRecord {
    // timestamp in the mh backend is the hard deletion timestamp
    override def values = Map("timestamp" -> createdAt, "softdelete_timestamp" -> deletedAt)
    def stateKey = TweetKey.LKey.HardDeletionStateKey
    def name = "hard_deleted"
  }

  /** When a tweet is deleted by the user. It can still be undeleted while in the soft deleted state. */
  case class SoftDeleted(tweetId: TweetId, createdAt: Long) extends TweetStateRecord {
    def stateKey = TweetKey.LKey.SoftDeletionStateKey
    def name = "soft_deleted"
  }

  /** When a tweet is deleted by go/bouncer for violating Twitter Rules. It MAY NOT be undeleted. */
  case class BounceDeleted(tweetId: TweetId, createdAt: Long) extends TweetStateRecord {
    def stateKey = TweetKey.LKey.BounceDeletionStateKey
    def name = "bounce_deleted"
  }

  /** When a tweet is undeleted by an internal system. */
  case class Undeleted(tweetId: TweetId, createdAt: Long) extends TweetStateRecord {
    def stateKey = TweetKey.LKey.UnDeletionStateKey
    def name = "undeleted"
  }

  /** When a tweet is created using the forceAdd endpoint. */
  case class ForceAdded(tweetId: TweetId, createdAt: Long) extends TweetStateRecord {
    def stateKey = TweetKey.LKey.ForceAddedStateKey
    def name = "force_added"
  }

  /**
   * Interprets a Manhattan record as a state record, based on its lkey. Returns None for any
   * lkey that is not a state key. A timestamp that cannot be decoded is reported as 0.
   */
  def fromTweetMhRecord(record: TweetManhattanRecord): Option[TweetStateRecord] = {
    // Decoded lazily, so only the timestamps a matching case needs are ever read.
    def timestampOrZero(tsType: TimestampType): Long =
      TimestampDecoder.decode(record, tsType).getOrElse(0L)
    def id = record.pkey

    val toStateRecord: PartialFunction[TweetKey.LKey, TweetStateRecord] = {
      case TweetKey.LKey.HardDeletionStateKey =>
        HardDeleted(
          id,
          timestampOrZero(TimestampType.Default),
          timestampOrZero(TimestampType.SoftDelete))
      case TweetKey.LKey.SoftDeletionStateKey =>
        SoftDeleted(id, timestampOrZero(TimestampType.Default))
      case TweetKey.LKey.BounceDeletionStateKey =>
        BounceDeleted(id, timestampOrZero(TimestampType.Default))
      case TweetKey.LKey.UnDeletionStateKey =>
        Undeleted(id, timestampOrZero(TimestampType.Default))
      case TweetKey.LKey.ForceAddedStateKey =>
        ForceAdded(id, timestampOrZero(TimestampType.Default))
    }

    toStateRecord.lift(record.lkey)
  }

  /** State records found among the given Manhattan records, in their original order. */
  def fromTweetMhRecords(records: Seq[TweetManhattanRecord]): Seq[TweetStateRecord] =
    records.flatMap(record => fromTweetMhRecord(record))

  /**
   * The state record with the greatest createdAt, if there is any state record at all. Among
   * records with equal createdAt the one appearing last in `records` is returned.
   */
  def mostRecent(records: Seq[TweetManhattanRecord]): Option[TweetStateRecord] =
    fromTweetMhRecords(records).reduceOption { (newestSoFar, candidate) =>
      if (candidate.createdAt >= newestSoFar.createdAt) candidate else newestSoFar
    }
}
|
|
@ -0,0 +1,201 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.stitch.Stitch
|
||||
import com.twitter.tweetypie.storage.Response.TweetResponse
|
||||
import com.twitter.tweetypie.thriftscala.Tweet
|
||||
import com.twitter.util.Future
|
||||
|
||||
/**
 * Interface for reading and writing tweet data in Manhattan. Each member exposes one operation
 * as a function value; the function types are declared in the companion object.
 */
trait TweetStorageClient {
  import TweetStorageClient._

  // Reads
  def getTweet: GetTweet
  def getStoredTweet: GetStoredTweet
  def getDeletedTweets: GetDeletedTweets

  // Writes
  def addTweet: AddTweet
  def updateTweet: UpdateTweet
  def deleteAdditionalFields: DeleteAdditionalFields
  def scrub: Scrub

  // Deletion state changes
  def softDelete: SoftDelete
  def bounceDelete: BounceDelete
  def hardDeleteTweet: HardDeleteTweet
  def undelete: Undelete

  def ping: Ping
}
|
||||
|
||||
object TweetStorageClient {

  // ---- Function types of the individual operations ----

  type GetTweet = TweetId => Stitch[GetTweet.Response]
  type GetStoredTweet = TweetId => Stitch[GetStoredTweet.Response]
  type HardDeleteTweet = TweetId => Stitch[HardDeleteTweet.Response]
  type SoftDelete = TweetId => Stitch[Unit]
  type BounceDelete = TweetId => Stitch[Unit]
  type Undelete = TweetId => Stitch[Undelete.Response]
  type AddTweet = Tweet => Stitch[Unit]
  type UpdateTweet = (Tweet, Seq[Field]) => Stitch[TweetResponse]
  type GetDeletedTweets = Seq[TweetId] => Stitch[Seq[DeletedTweetResponse]]
  type DeleteAdditionalFields = (Seq[TweetId], Seq[Field]) => Stitch[Seq[TweetResponse]]
  type Scrub = (Seq[TweetId], Seq[Field]) => Stitch[Unit]
  type Ping = () => Future[Unit]

  object GetTweet {
    sealed trait Response
    object Response {
      case class Found(tweet: Tweet) extends Response
      object NotFound extends Response
      object Deleted extends Response
      // On BounceDeleted, provide the full Tweet so that implementations
      // (i.e. ManhattanTweetStorageClient) do not need to be aware of the specific tweet
      // fields required by callers for proper processing of bounce deleted tweets.
      case class BounceDeleted(tweet: Tweet) extends Response
    }
  }

  object GetStoredTweet {

    /** A problem detected while loading a stored tweet; `toString` is the message. */
    sealed abstract class Error(val message: String) {
      override def toString: String = message
    }
    object Error {
      case object TweetIsCorrupt extends Error("stored tweet data is corrupt and cannot be decoded")

      case object ScrubbedFieldsPresent
          extends Error("stored tweet fields that should be scrubbed are still present")

      case object TweetFieldsMissingOrInvalid
          extends Error("expected tweet fields are missing or contain invalid values")

      case object TweetShouldBeHardDeleted
          extends Error("stored tweet that should be hard deleted is still present")
    }

    sealed trait Response
    object Response {

      /** State and scrubbed-field information that accompanies a response. */
      sealed trait StoredTweetMetadata {
        def state: Option[TweetStateRecord]
        def allStates: Seq[TweetStateRecord]
        def scrubbedFields: Set[FieldId]
      }

      /** Mixed into responses that report problems found in the stored data. */
      sealed trait StoredTweetErrors {
        def errs: Seq[Error]
      }

      /**
       * Tweet data was found, possibly state records and/or scrubbed field records.
       */
      sealed trait FoundAny extends Response with StoredTweetMetadata {
        def tweet: Tweet
      }

      object FoundAny {

        /**
         * Extracts (tweet, state, allStates, scrubbedFields, errs) from any FoundAny response.
         * `errs` is empty unless the response is a FoundWithErrors.
         */
        def unapply(
          response: Response
        ): Option[
          (Tweet, Option[TweetStateRecord], Seq[TweetStateRecord], Set[FieldId], Seq[Error])
        ] =
          response match {
            case found: FoundAny =>
              val errs = found match {
                case withErrors: FoundWithErrors => withErrors.errs
                case _ => Seq.empty
              }
              Some((found.tweet, found.state, found.allStates, found.scrubbedFields, errs))
            case _ => None
          }
      }

      /**
       * No records for this tweet id were found in storage
       */
      case class NotFound(id: TweetId) extends Response

      /**
       * Data related to the Tweet id was found but could not be loaded successfully. The
       * errs array contains details of the problems.
       */
      case class Failed(
        id: TweetId,
        state: Option[TweetStateRecord],
        allStates: Seq[TweetStateRecord],
        scrubbedFields: Set[FieldId],
        errs: Seq[Error])
          extends Response
          with StoredTweetMetadata
          with StoredTweetErrors

      /**
       * No Tweet data was found, and the most recent state record found is HardDeleted
       */
      case class HardDeleted(
        id: TweetId,
        state: Option[TweetStateRecord.HardDeleted],
        allStates: Seq[TweetStateRecord],
        scrubbedFields: Set[FieldId])
          extends Response
          with StoredTweetMetadata

      /**
       * Tweet data was found, and the most recent state record found, if any, is not
       * any form of deletion record.
       */
      case class Found(
        tweet: Tweet,
        state: Option[TweetStateRecord],
        allStates: Seq[TweetStateRecord],
        scrubbedFields: Set[FieldId])
          extends FoundAny

      /**
       * Tweet data was found, and the most recent state record found indicates deletion.
       */
      case class FoundDeleted(
        tweet: Tweet,
        state: Option[TweetStateRecord],
        allStates: Seq[TweetStateRecord],
        scrubbedFields: Set[FieldId])
          extends FoundAny

      /**
       * Tweet data was found, however errors were detected in the stored data. Required
       * fields may be missing from the Tweet struct (e.g. CoreData), stored fields that
       * should be scrubbed remain present, or Tweets that should be hard-deleted remain
       * in storage. The errs array contains details of the problems.
       */
      case class FoundWithErrors(
        tweet: Tweet,
        state: Option[TweetStateRecord],
        allStates: Seq[TweetStateRecord],
        scrubbedFields: Set[FieldId],
        errs: Seq[Error])
          extends FoundAny
          with StoredTweetErrors
    }
  }

  object HardDeleteTweet {
    sealed trait Response
    object Response {
      case class Deleted(deletedAtMillis: Option[Long], createdAtMillis: Option[Long])
          extends Response

      // NotDeleted is also a Throwable, so it can be raised as well as returned.
      case class NotDeleted(id: TweetId, ineligibleLKey: Option[TweetKey.LKey])
          extends Throwable
          with Response
    }
  }

  object Undelete {
    case class Response(
      code: UndeleteResponseCode,
      tweet: Option[Tweet] = None,
      createdAtMillis: Option[Long] = None,
      archivedAtMillis: Option[Long] = None)

    sealed trait UndeleteResponseCode

    object UndeleteResponseCode {
      object Success extends UndeleteResponseCode
      object BackupNotFound extends UndeleteResponseCode
      object NotCreated extends UndeleteResponseCode
    }
  }
}
|
|
@ -0,0 +1,34 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import scala.util.control.NoStackTrace
|
||||
|
||||
/** Common base of the exceptions defined by the tweet storage library. */
sealed abstract class TweetStorageException(message: String, cause: Throwable)
    extends Exception(message, cause)

/**
 * The request was malformed and failed an assertion present in the code. Retrying it
 * unchanged will not help.
 */
case class ClientError(message: String, cause: Throwable)
    extends TweetStorageException(message, cause)
    with NoStackTrace

/**
 * Manhattan or the in-process rate limiter rejected the request. Should not be retried.
 */
case class RateLimited(message: String, cause: Throwable)
    extends TweetStorageException(message, cause)
    with NoStackTrace

/**
 * Corrupt tweets were requested from Manhattan.
 */
case class VersionMismatchError(message: String, cause: Throwable = null)
    extends TweetStorageException(message, cause)
    with NoStackTrace

/**
 * Any other unhandled exception. Unlike the other subclasses, this one does not mix in
 * NoStackTrace.
 */
case class InternalError(message: String, cause: Throwable = null)
    extends TweetStorageException(message, cause)
|
|
@ -0,0 +1,265 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.logging.Logger
|
||||
import com.twitter.scrooge.TFieldBlob
|
||||
import com.twitter.snowflake.id.SnowflakeId
|
||||
import com.twitter.storage.client.manhattan.kv.DeniedManhattanException
|
||||
import com.twitter.storage.client.manhattan.kv.ManhattanException
|
||||
import com.twitter.tweetypie.storage.Response._
|
||||
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
|
||||
import com.twitter.util.Return
|
||||
import com.twitter.util.Throw
|
||||
import com.twitter.util.Try
|
||||
|
||||
object TweetUtils {
|
||||
val log: Logger = Logger("com.twitter.tweetypie.storage.TweetStorageLibrary")
|
||||
import FieldResponseCodec.ValueNotFoundException
|
||||
|
||||
/**
 * A stored tweet is valid when it has a non-zero user id and its text, createdVia and
 * createdAtSec are all present.
 *
 * It's rare, but we have seen tweets with userId=0, which is likely the result of a
 * failed/partial delete. Such tweets are treated as invalid and are returned to callers
 * as not found.
 */
def isValid(tweet: StoredTweet): Boolean = {
  val hasRealUser = tweet.userId.exists(userId => userId != 0)
  val hasRequiredFields =
    tweet.text.isDefined && tweet.createdVia.isDefined && tweet.createdAtSec.isDefined
  hasRealUser && hasRequiredFields
}
|
||||
|
||||
/**
 * Collects the scrubbed field ids out of the records returned by reading the entire tweet
 * prefix: every record whose lkey is a ScrubbedFieldKey contributes its field id.
 *
 * @param records The sequence of MH records for the given tweetId
 *
 * @return The set of scrubbed field ids
 */
private[tweetypie] def extractScrubbedFields(records: Seq[TweetManhattanRecord]): Set[Short] =
  records.flatMap { record =>
    record.lkey match {
      case TweetKey.LKey.ScrubbedFieldKey(fieldId) => Some(fieldId)
      case _ => None
    }
  }.toSet
|
||||
|
||||
// Required field ids, excluding the tweet id field.
private[tweetypie] val expectedFields =
  TweetFields.requiredFieldIds.toSet.filterNot(_ == TweetFields.tweetIdField)
|
||||
|
||||
/**
 * Find the timestamp from a tweetId and a list of MH records. This is used when
 * you need a timestamp and you aren't sure that tweetId is a snowflake id.
 *
 * The snowflake id is tried first; only when it yields no time is the core fields record
 * decoded and its createdAtSec converted to milliseconds.
 *
 * @param tweetId A tweetId you want the timestamp for.
 * @param records Tbird_mh records keyed on tweetId, one of which should be the
 * core fields record.
 * @return A milliseconds timestamp if one could be found.
 */
private[tweetypie] def creationTimeFromTweetIdOrMHRecords(
  tweetId: Long,
  records: Seq[TweetManhattanRecord]
): Option[Long] = {
  // Fallback, evaluated only if the id carries no timestamp.
  def fromCoreFieldsRecord: Option[Long] =
    for {
      coreRecord <- records.find(record => record.lkey == TweetKey.LKey.CoreFieldsKey)
      createdAtSec <- CoreFieldsCodec
        .fromTFieldBlob(TFieldBlobCodec.fromByteBuffer(coreRecord.value.contents))
        .createdAtSec
    } yield createdAtSec * 1000

  SnowflakeId.unixTimeMillisOptFromId(tweetId).orElse(fromCoreFieldsRecord)
}
|
||||
|
||||
/**
 * Helper function used to turn the per-field manhattan results of a tweet (given as a map from
 * field id to Try[Unit]) into a TweetResponse object.
 *
 * The overall code is:
 *  - Success when there is at least one field and every field succeeded,
 *  - OverCapacity when any field failed with a DeniedManhattanException,
 *  - Failure when no field succeeded,
 *  - Partial otherwise.
 * A field that failed with ValueNotFoundException is counted as a success.
 *
 * @param callerName   The name of the caller function. Used for error messages
 * @param tweetId      Id of the Tweet for which TweetResponse is being built
 * @param fieldResults Map of field id to the result of the operation on that field.
 *
 * @return TweetResponse object
 */
private[tweetypie] def buildTweetResponse(
  callerName: String,
  tweetId: Long,
  fieldResults: Map[FieldId, Try[Unit]]
): TweetResponse = {
  val successCount = fieldResults.values.count {
    case Return(_) | Throw(_: ValueNotFoundException) => true
    case _ => false
  }

  val fieldResponsesMap = getFieldResponses(callerName, tweetId, fieldResults)

  // If any field was rate limited, the entire tweet is considered rate limited.
  def wasRateLimited: Boolean =
    fieldResults.values.exists {
      case Throw(_: DeniedManhattanException) => true
      case _ => false
    }

  val allSucceeded = successCount > 0 && successCount == fieldResults.size

  val overallCode =
    if (allSucceeded) TweetResponseCode.Success
    else if (wasRateLimited) TweetResponseCode.OverCapacity
    else if (successCount == 0) TweetResponseCode.Failure // nothing succeeded
    else TweetResponseCode.Partial // some, but not all, fields succeeded

  TweetResponse(tweetId, overallCode, Some(fieldResponsesMap))
}
|
||||
|
||||
/**
 * Convert per-field Manhattan results into a Map[FieldId, FieldResponse].
 *
 * A Return becomes a Success response. A ManhattanException is converted with
 * FieldResponseCodec.fromThrowable, and any other exception becomes an Error response.
 * Every failure is logged except ValueNotFoundException, which is not treated as an error.
 *
 * @param callerName   The name of the calling function. Used for error messages
 * @param tweetId      Id of the Tweet the fields belong to
 * @param fieldResults Map from field id to the result of the operation on that field
 */
private[tweetypie] def getFieldResponses(
  callerName: String,
  tweetId: TweetId,
  fieldResults: Map[FieldId, Try[_]]
): Map[FieldId, FieldResponse] = {
  // Only invoked for failures, so the key string is never built for successful fields.
  def describe(fieldId: FieldId, error: Throwable): String = {
    val keyStr = TweetKey.fieldKey(tweetId, fieldId).toString
    s"Exception in $callerName. Key: $keyStr. Error: $error"
  }

  fieldResults.map {
    case (fieldId, Return(_)) =>
      fieldId -> FieldResponse(FieldResponseCode.Success, None)

    case (fieldId, Throw(mhException: ManhattanException)) =>
      val errMsg = describe(fieldId, mhException)
      mhException match {
        case _: ValueNotFoundException => () // ValueNotFound is not an error
        case _ => log.error(errMsg)
      }
      fieldId -> FieldResponseCodec.fromThrowable(mhException, Some(errMsg))

    case (fieldId, Throw(other)) =>
      val errMsg = describe(fieldId, other)
      log.error(errMsg)
      fieldId -> FieldResponse(FieldResponseCode.Error, Some(errMsg))
  }
}
|
||||
|
||||
/**
 * Build the TweetResponse for a request that was rate limited. It's possible that only some
 * of the fields were rate limited, so the per-field responses still show which fields were
 * processed successfully and which hit an error.
 *
 * @param callerName     name of the API calling this function. Used for error messages
 * @param tweetId        Tweet id
 * @param fieldResponses Map from field id to the result of the operation on that field
 *
 * @return A TweetResponse whose overall code is always OverCapacity
 */
private[tweetypie] def buildTweetOverCapacityResponse(
  callerName: String,
  tweetId: Long,
  fieldResponses: Map[FieldId, Try[Unit]]
) =
  TweetResponse(
    tweetId,
    TweetResponseCode.OverCapacity,
    Some(getFieldResponses(callerName, tweetId, fieldResponses))
  )
|
||||
|
||||
/**
 * Assemble a StoredTweet from a Seq of records by setting each decoded field blob on an
 * initially empty StoredTweet(tweetId).
 *
 * The blob whose id is TweetFields.rootCoreFieldId holds the packed core fields and is
 * expanded into its individual blobs first. Nothing is thrown here if the core fields are
 * absent; the caller is expected to treat a result without them as "tweet not found".
 *
 * @param includeScrubbed passed through to getStoredTweetBlobs
 */
private[tweetypie] def buildStoredTweet(
  tweetId: TweetId,
  records: Seq[TweetManhattanRecord],
  includeScrubbed: Boolean = false,
): StoredTweet = {
  val expandedBlobs = getStoredTweetBlobs(records, includeScrubbed).flatMap {
    case packedCore if packedCore.id == TweetFields.rootCoreFieldId =>
      // The value is a serialized/packed form of all core fields: unpack into many blobs.
      CoreFieldsCodec.unpackFields(packedCore).values.toSeq
    case singleField =>
      Seq(singleField)
  }

  expandedBlobs.foldLeft(StoredTweet(tweetId)) { (tweet, blob) => tweet.setField(blob) }
}
|
||||
|
||||
/**
 * Build a StoredTweet from the records (scrubbed fields excluded) and return it only if it
 * has at least one of the `expectedFields` blobs and passes `isValid`; otherwise None.
 */
private[tweetypie] def buildValidStoredTweet(
  tweetId: TweetId,
  records: Seq[TweetManhattanRecord]
): Option[StoredTweet] =
  Some(buildStoredTweet(tweetId, records)).filter { candidate =>
    candidate.getFieldBlobs(expectedFields).nonEmpty && isValid(candidate)
  }
|
||||
|
||||
/**
 * Decode a TFieldBlob for every field record (LKey.FieldKey) in this set of records.
 *
 * Throws an AssertionError if a decoded blob's field id differs from the id in its key.
 * A VersionMismatchError raised while decoding is logged and rethrown.
 *
 * @param includeScrubbed when false, the result will not include scrubbed fields even
 *                        if the data is present in the set of records.
 */
private[tweetypie] def getStoredTweetBlobs(
  records: Seq[TweetManhattanRecord],
  includeScrubbed: Boolean = false,
): Seq[TFieldBlob] = {
  val scrubbedIds = extractScrubbedFields(records)

  records.flatMap { record =>
    record.key match {
      case recordKey @ TweetKey(_, fieldKey: TweetKey.LKey.FieldKey)
          if includeScrubbed || !scrubbedIds.contains(fieldKey.fieldId) =>
        val decoded =
          try TFieldBlobCodec.fromByteBuffer(record.value.contents)
          catch {
            case mismatch: VersionMismatchError =>
              log.error(s"Failed to decode bytebuffer for $recordKey: ${mismatch.getMessage}")
              throw mismatch
          }

        // The blob must describe the same field as the key it was stored under.
        if (decoded.field.id != fieldKey.fieldId) {
          throw new AssertionError(
            s"Blob stored for $recordKey has unexpected id ${decoded.field.id}"
          )
        }
        Some(decoded)

      // Not a field record, or a scrubbed field that was not requested.
      case _ => None
    }
  }
}
|
||||
|
||||
/**
 * Collapse a sequence of results into one, giving priority to rate limiting. Rate limiting
 * exceptions are bubbled up specifically because they would likely be the root cause of
 * other issues (timeouts etc.).
 *
 * @param seqOfTries The sequence of tries which may contain a rate limit exception
 *
 * @return the first Throw carrying a DeniedManhattanException if there is one; otherwise the
 *         result of Try.collect over all tries mapped to Unit, so the operation is considered
 *         successful only if every individual try succeeded
 */
private[tweetypie] def collectWithRateLimitCheck(seqOfTries: Seq[Try[Unit]]): Try[Unit] = {
  def isRateLimited(result: Try[Unit]): Boolean =
    result match {
      case Throw(_: DeniedManhattanException) => true
      case _ => false
    }

  seqOfTries.find(isRateLimited) match {
    case Some(rateLimited) => rateLimited
    case None => Try.collect(seqOfTries).map(_ => ())
  }
}
|
||||
}
|
|
@ -0,0 +1,106 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.stitch.Stitch
|
||||
import com.twitter.tweetypie.storage.TweetStorageClient.Undelete
|
||||
import com.twitter.tweetypie.storage.TweetUtils._
|
||||
import com.twitter.util.Time
|
||||
|
||||
/**
 * Builds the Undelete storage operation: read the tweet's records, decide from the most
 * recent state record whether the tweet can be restored, write an undelete state record
 * when one is called for, and delete the soft-deletion state record.
 */
object UndeleteHandler {
  def apply(
    read: ManhattanOperations.Read,
    localInsert: ManhattanOperations.Insert,
    remoteInsert: ManhattanOperations.Insert,
    delete: ManhattanOperations.Delete,
    undeleteWindowHours: Int,
    stats: StatsReceiver
  ): Undelete = {
    // Compares the time elapsed since `timestampMs`, in hours, with the configured window.
    def withinUndeleteWindow(timestampMs: Long) = {
      val elapsed = Time.now - Time.fromMilliseconds(timestampMs)
      elapsed.inHours < undeleteWindowHours
    }

    /**
     * Decide the response and, optionally, the undelete state record to write, based on the
     * most recent state record found in `records`.
     */
    def prepareUndelete(
      tweetId: TweetId,
      records: Seq[TweetManhattanRecord]
    ): (Undelete.Response, Option[TweetManhattanRecord]) = {
      val undeleteRecord =
        Some(TweetStateRecord.Undeleted(tweetId, Time.now.inMillis).toTweetMhRecord)

      def backupNotFound: (Undelete.Response, Option[TweetManhattanRecord]) =
        (Undelete.Response(Undelete.UndeleteResponseCode.BackupNotFound), None)

      TweetStateRecord.mostRecent(records) match {
        // A soft deletion without a usable timestamp cannot be checked against the window.
        case Some(TweetStateRecord.SoftDeleted(_, createdAt)) if createdAt <= 0 =>
          throw InternalError(s"Timestamp unavailable for $tweetId")

        // Undo a soft deletion that is still inside the undelete window.
        case Some(TweetStateRecord.SoftDeleted(_, createdAt))
            if withinUndeleteWindow(createdAt) =>
          (mkSuccessfulUndeleteResponse(tweetId, records, Some(createdAt)), undeleteRecord)

        // Soft deleted, but the undelete window has passed.
        case Some(_: TweetStateRecord.SoftDeleted) =>
          backupNotFound

        // BounceDeleted tweets may not be undeleted. see go/bouncedtweet
        case Some(_: TweetStateRecord.HardDeleted | _: TweetStateRecord.BounceDeleted) =>
          backupNotFound

        // We still write the undelete record here: we only know that the local DC's winning
        // record is not a soft/hard deletion record, while the remote DC's winning record
        // might still be a soft deletion record.
        case Some(_: TweetStateRecord.Undeleted) =>
          (mkSuccessfulUndeleteResponse(tweetId, records), undeleteRecord)

        // No undelete record is written when the winning record is a forceAdd, as the
        // forceAdd call should have ensured that both DCs had the forceAdd record.
        case Some(_: TweetStateRecord.ForceAdded) =>
          (mkSuccessfulUndeleteResponse(tweetId, records), None)

        // Write the undeletion record just in case there is a soft deletion record in flight.
        case None =>
          (mkSuccessfulUndeleteResponse(tweetId, records), undeleteRecord)
      }
    }

    // Write the undelete record both locally and remotely to protect against races with
    // hard delete replication. This protection is only needed for the insertion of the
    // undelete record.
    def multiInsert(record: TweetManhattanRecord): Stitch[Unit] = {
      val inserts = Seq(localInsert, remoteInsert).map(insert => insert(record).liftToTry)
      Stitch.collect(inserts).map(collectWithRateLimitCheck).lowerFromTry
    }

    def deleteSoftDeleteRecord(tweetId: TweetId): Stitch[Unit] =
      delete(TweetKey.softDeletionStateKey(tweetId), None)

    tweetId =>
      for {
        records <- read(tweetId)
        (response, undeleteRecord) = prepareUndelete(tweetId, records)
        _ <- Stitch.collect(undeleteRecord.map(multiInsert)).unit
        // NOTE(review): this delete runs for every outcome, including BackupNotFound
        // responses -- confirm that is intended.
        _ <- deleteSoftDeleteRecord(tweetId)
      } yield response
  }

  /**
   * Build a Success response carrying the tweet rebuilt from `records`, with `timestampOpt`
   * reported as `archivedAtMillis`.
   */
  private[storage] def mkSuccessfulUndeleteResponse(
    tweetId: TweetId,
    records: Seq[TweetManhattanRecord],
    timestampOpt: Option[Long] = None
  ) = {
    val restoredTweet = StorageConversions.fromStoredTweet(buildStoredTweet(tweetId, records))
    Undelete.Response(
      Undelete.UndeleteResponseCode.Success,
      Some(restoredTweet),
      archivedAtMillis = timestampOpt
    )
  }
}
|
|
@ -0,0 +1,64 @@
|
|||
package com.twitter.tweetypie.storage
|
||||
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.stitch.Stitch
|
||||
import com.twitter.storage.client.manhattan.kv.DeniedManhattanException
|
||||
import com.twitter.storage.client.manhattan.kv.ManhattanValue
|
||||
import com.twitter.tweetypie.storage.TweetUtils._
|
||||
import com.twitter.tweetypie.thriftscala.Tweet
|
||||
import com.twitter.util.Throw
|
||||
import com.twitter.util.Time
|
||||
|
||||
/**
 * Builds the UpdateTweet storage operation, which writes the given non-core fields of a
 * tweet to Manhattan, one record per field, and reports a per-field TweetResponse.
 */
object UpdateTweetHandler {

  /**
   * @param insert writes a single record to Manhattan
   * @param stats  receiver for the per-field QPS counters
   * @return a function that throws IllegalArgumentException (via `require`) when `fields`
   *         contains a core field or a field that is not set on the input tweet
   */
  def apply(
    insert: ManhattanOperations.Insert,
    stats: StatsReceiver
  ): TweetStorageClient.UpdateTweet = { (tpTweet: Tweet, fields: Seq[Field]) =>
    // Compare field *ids*: passing the Field itself to `contains` on a collection of ids
    // can never match, which would make this guard a no-op.
    // NOTE(review): assumes TweetFields.coreFieldIds holds FieldId values -- confirm.
    require(
      fields.forall(field => !TweetFields.coreFieldIds.contains(field.id)),
      "Core fields cannot be modified by calling updateTweet; use addTweet instead."
    )
    require(
      areAllFieldsDefined(tpTweet, fields),
      s"Input tweet $tpTweet does not have specified fields $fields set"
    )

    val now = Time.now
    val storedTweet = StorageConversions.toStoredTweetForFields(tpTweet, fields.toSet)
    val tweetId = storedTweet.id
    Stats.updatePerFieldQpsCounters("updateTweet", fields.map(_.id), 1, stats)

    val (fieldIds, stitchesPerTweet) =
      fields.map { field =>
        val fieldId = field.id
        val tweetKey = TweetKey.fieldKey(tweetId, fieldId)
        // `.get` is guarded by the areAllFieldsDefined requirement above.
        val blob = storedTweet.getFieldBlob(fieldId).get
        val value = ManhattanValue(TFieldBlobCodec.toByteBuffer(blob), Some(now))
        val record = TweetManhattanRecord(tweetKey, value)

        (fieldId, insert(record).liftToTry)
      }.unzip

    Stitch.collect(stitchesPerTweet).map { seqOfTries =>
      val fieldkeyAndMhResults = fieldIds.zip(seqOfTries).toMap

      // If even a single field was rate limited, we will send an overall OverCapacity
      // TweetResponse
      val wasRateLimited = fieldkeyAndMhResults.values.exists {
        case Throw(_: DeniedManhattanException) => true
        case _ => false
      }

      if (wasRateLimited) {
        buildTweetOverCapacityResponse("updateTweets", tweetId, fieldkeyAndMhResults)
      } else {
        buildTweetResponse("updateTweets", tweetId, fieldkeyAndMhResults)
      }
    }
  }

  // True when every requested field has a blob in the stored form of the input tweet.
  private def areAllFieldsDefined(tpTweet: Tweet, fields: Seq[Field]) = {
    val storedTweet = StorageConversions.toStoredTweetForFields(tpTweet, fields.toSet)
    fields.map(_.id).forall(storedTweet.getFieldBlob(_).isDefined)
  }
}
|
|
@ -0,0 +1,11 @@
|
|||
package com.twitter.tweetypie
|
||||
|
||||
import com.twitter.storage.client.manhattan.kv.ManhattanValue
|
||||
import java.nio.ByteBuffer
|
||||
|
||||
/** Type aliases shared by the tweet storage code. */
package object storage {
  // Tweet ids are plain Longs.
  type TweetId = Long

  // Field ids are plain Shorts.
  type FieldId = Short

  // A Manhattan value whose contents are a raw ByteBuffer.
  type TweetManhattanValue = ManhattanValue[ByteBuffer]
}
|
|
@ -0,0 +1,20 @@
|
|||
# Build target for the Scala sources in this directory.
scala_library(
    sources = ["*.scala"],
    compiler_option_sets = ["fatal_warnings"],
    strict_deps = True,
    tags = ["bazel-compatible"],
    dependencies = [
        "finagle/finagle-core/src/main",
        "flock-client/src/main/scala",
        "flock-client/src/main/thrift:thrift-scala",
        "tweetypie/servo/util/src/main/scala",
        "snowflake:id",
        "src/thrift/com/twitter/gizmoduck:thrift-scala",
        "src/thrift/com/twitter/servo:servo-exception-java",
        "tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
        "tweetypie/server/src/main/scala/com/twitter/tweetypie",
        "tweetypie/server/src/main/scala/com/twitter/tweetypie/serverutil",
        "tweetypie/common/src/scala/com/twitter/tweetypie/util",
        "util/util-core:scala",
    ],
)
|
|
@ -0,0 +1,532 @@
|
|||
/** Copyright 2010 Twitter, Inc. */
|
||||
package com.twitter.tweetypie
|
||||
package tflock
|
||||
|
||||
import com.twitter.finagle.stats.Counter
|
||||
import com.twitter.flockdb.client._
|
||||
import com.twitter.flockdb.client.thriftscala.Priority
|
||||
import com.twitter.snowflake.id.SnowflakeId
|
||||
import com.twitter.tweetypie.serverutil.StoredCard
|
||||
import com.twitter.tweetypie.thriftscala._
|
||||
import com.twitter.util.Future
|
||||
import scala.collection.mutable.ListBuffer
|
||||
|
||||
/** Static graph metadata and stats helpers used by the TFlockIndexer class. */
object TFlockIndexer {

  /**
   * Printable names for some edge types currently defined in [[com.twitter.flockdb.client]].
   * Used to define stats counters for adding edges.
   */
  val graphNames: Map[Int, String] =
    Map(
      CardTweetsGraph.id -> "card_tweets",
      ConversationGraph.id -> "conversation",
      DirectedAtUserIdGraph.id -> "directed_at_user_id",
      InvitedUsersGraph.id -> "invited_users",
      MediaTimelineGraph.id -> "media_timeline",
      MentionsGraph.id -> "mentions",
      NarrowcastSentTweetsGraph.id -> "narrowcast_sent_tweets",
      NullcastedTweetsGraph.id -> "nullcasted_tweets",
      QuotersGraph.id -> "quoters",
      QuotesGraph.id -> "quotes",
      QuoteTweetsIndexGraph.id -> "quote_tweets_index",
      RepliesToTweetsGraph.id -> "replies_to_tweets",
      RetweetsByMeGraph.id -> "retweets_by_me",
      RetweetsGraph.id -> "retweets",
      RetweetsOfMeGraph.id -> "retweets_of_me",
      RetweetSourceGraph.id -> "retweet_source",
      TweetsRetweetedGraph.id -> "tweets_retweeted",
      UserTimelineGraph.id -> "user_timeline",
      CreatorSubscriptionTimelineGraph.id -> "creator_subscription_timeline",
      CreatorSubscriptionMediaTimelineGraph.id -> "creator_subscription_image_timeline",
    )

  /**
   * On edge deletion, edges are either archived permanently or retained for 3 months,
   * depending on the retention policy of their graph.
   *
   * These two retention policies correspond to the two deletion techniques: archive and
   * remove. We call removeEdges for edges with a short retention policy (the graphs listed
   * here) and archiveEdges for edges with a permanent retention policy.
   */
  val graphsWithRemovedEdges: Seq[Int] =
    Seq(
      CardTweetsGraph.id,
      CuratedTimelineGraph.id,
      CuratedTweetsGraph.id,
      DirectedAtUserIdGraph.id,
      MediaTimelineGraph.id,
      MutedConversationsGraph.id,
      QuotersGraph.id,
      QuotesGraph.id,
      QuoteTweetsIndexGraph.id,
      ReportedTweetsGraph.id,
      RetweetsOfMeGraph.id,
      RetweetSourceGraph.id,
      SoftLikesGraph.id,
      TweetsRetweetedGraph.id,
      CreatorSubscriptionTimelineGraph.id,
      CreatorSubscriptionMediaTimelineGraph.id,
    )

  /**
   * These edges should be left in place when bounced tweets are deleted.
   * These edges are removed during hard deletion.
   *
   * This is done so external teams (timelines) can execute on these edges for the
   * tombstone feature.
   */
  val bounceDeleteGraphIds: Set[Int] =
    Set(
      UserTimelineGraph.id,
      ConversationGraph.id
    )

  /**
   * Create one counter per named graph for the given operation, scoped by the graph's
   * printable name. Lookups for graph ids without a name fall back to a counter scoped
   * under "unknown".
   *
   * @param stats     receiver under which the per-graph scopes are created
   * @param operation counter name, e.g. "insert" or "archive"
   * @return map from graph id to its counter, with a default for unknown graph ids
   */
  def makeCounters(stats: StatsReceiver, operation: String): Map[Int, Counter] = {
    // Built with `map` rather than `mapValues`: `mapValues` yields a lazy view that would
    // re-run `stats.scope(name).counter(operation)` on every lookup instead of once per graph.
    val countersByGraphId: Map[Int, Counter] =
      graphNames.map { case (graphId, name) => graphId -> stats.scope(name).counter(operation) }

    countersByGraphId.withDefaultValue(stats.scope("unknown").counter(operation))
  }
}
|
||||
|
||||
/**
|
||||
* @param backgroundIndexingPriority specifies the queue to use for
|
||||
* background indexing operations. This is useful for making the
|
||||
* effects of background indexing operations (such as deleting edges
|
||||
* for deleted Tweets) available sooner in testing scenarios
|
||||
* (end-to-end tests or development instances). It is set to
|
||||
* Priority.Low in production to reduce the load on high priority
|
||||
* queues that we use for prominently user-visible operations.
|
||||
*/
|
||||
class TFlockIndexer(
|
||||
tflock: TFlockClient,
|
||||
hasMedia: Tweet => Boolean,
|
||||
backgroundIndexingPriority: Priority,
|
||||
stats: StatsReceiver)
|
||||
extends TweetIndexer {
|
||||
// Shared result for "no extra edges".
private[this] val FutureNil = Future.Nil

// Per-graph counters, one map per kind of edge operation.
private[this] val archiveCounters = TFlockIndexer.makeCounters(stats, "archive")
private[this] val removeCounters = TFlockIndexer.makeCounters(stats, "remove")
private[this] val insertCounters = TFlockIndexer.makeCounters(stats, "insert")
private[this] val negateCounters = TFlockIndexer.makeCounters(stats, "negate")

// Edge creation always runs at high priority.
private[this] val foregroundIndexingPriority: Priority = Priority.High

/** Index a newly created tweet by inserting its edges. */
override def createIndex(tweet: Tweet): Future[Unit] = {
  createEdges(tweet, isUndelete = false)
}

/** Index an undeleted tweet by inserting its edges in undelete mode. */
override def undeleteIndex(tweet: Tweet): Future[Unit] = {
  createEdges(tweet, isUndelete = true)
}
|
||||
|
||||
// Edges of a deleted tweet, grouped by what should happen to them.
private[this] case class PartitionedEdges(
  longRetention: Seq[ExecuteEdge[StatusGraph]] = Nil,
  shortRetention: Seq[ExecuteEdge[StatusGraph]] = Nil,
  negate: Seq[ExecuteEdge[StatusGraph]] = Nil,
  ignore: Seq[ExecuteEdge[StatusGraph]] = Nil)

/**
 * Sort the edges of a deleted tweet into the buckets of PartitionedEdges. Edges are
 * prepended, so each bucket ends up in reverse input order.
 *
 * Two dependees of UserTimelineGraph edge states have to be satisfied: timelines and
 * safety tools.
 *  - Timelines show bounce-deleted tweets as tombstones; regular deletes are not shown,
 *    i.e. timelineIds = UserTimelineGraph(Normal || Negative)
 *  - Safety tools show deleted tweets to authorized internal review agents,
 *    i.e. deletedIds = UserTimelineGraph(Removed || Negative)
 */
private[this] def partitionEdgesForDelete(
  edges: Seq[ExecuteEdge[StatusGraph]],
  isBounceDelete: Boolean
) =
  edges.foldLeft(PartitionedEdges()) { (acc, edge) =>
    if (isBounceDelete && edge.graphId == UserTimelineGraph.id) {
      acc.copy(negate = edge +: acc.negate)
    } else if (isBounceDelete && edge.graphId == ConversationGraph.id) {
      // Bounce-deleted tweets remain rendered as tombstones in conversations, so the
      // ConversationGraph edge state is not modified.
      acc.copy(ignore = edge +: acc.ignore)
    } else if (TFlockIndexer.graphsWithRemovedEdges.contains(edge.graphId)) {
      acc.copy(shortRetention = edge +: acc.shortRetention)
    } else {
      acc.copy(longRetention = edge +: acc.longRetention)
    }
  }
|
||||
|
||||
/**
 * Remove a deleted tweet from the index: long-retention edges are archived, short-retention
 * edges are removed and, for bounce deletes, the selected edges are negated. All three calls
 * use the background priority, and each per-graph counter is bumped only when its call
 * succeeds.
 */
override def deleteIndex(tweet: Tweet, isBounceDelete: Boolean): Future[Unit] =
  getEdges(tweet, isCreate = false, isDelete = true, isUndelete = false).flatMap { edges =>
    val partitioned = partitionEdgesForDelete(edges, isBounceDelete)

    // Bumps the matching per-graph counter once for every edge of a completed batch.
    def countEach(batch: Seq[ExecuteEdge[StatusGraph]], counters: Map[Int, Counter]): Unit =
      batch.foreach(edge => counters(edge.graphId).incr())

    val archived =
      tflock
        .archiveEdges(partitioned.longRetention, backgroundIndexingPriority)
        .onSuccess(_ => countEach(partitioned.longRetention, archiveCounters))
    val removed =
      tflock
        .removeEdges(partitioned.shortRetention, backgroundIndexingPriority)
        .onSuccess(_ => countEach(partitioned.shortRetention, removeCounters))
    val negated =
      tflock
        .negateEdges(partitioned.negate, backgroundIndexingPriority)
        .onSuccess(_ => countEach(partitioned.negate, negateCounters))

    Future.join(archived, removed, negated).unit
  }
|
||||
|
||||
/**
 * Called when a user is put into or taken out of a state in which their retweets should no
 * longer be visible (e.g. suspended or ROPO). The retweet's reverse RetweetsGraph edge is
 * inserted when `setVisible` is true and archived otherwise.
 */
override def setRetweetVisibility(retweetId: TweetId, setVisible: Boolean): Future[Unit] = {
  val retweetEdge = Seq(ExecuteEdge(retweetId, RetweetsGraph, None, Reverse))

  setVisible match {
    case true =>
      tflock
        .insertEdges(retweetEdge, backgroundIndexingPriority)
        .onSuccess(_ => insertCounters(RetweetsGraph.id).incr())
    case false =>
      tflock
        .archiveEdges(retweetEdge, backgroundIndexingPriority)
        .onSuccess(_ => archiveCounters(RetweetsGraph.id).incr())
  }
}
|
||||
|
||||
// Computes the edges for a created (or undeleted) tweet and inserts them at foreground
// priority. The insert counters are bumped only after the insert has succeeded.
private[this] def createEdges(tweet: Tweet, isUndelete: Boolean): Future[Unit] =
  getEdges(tweet = tweet, isCreate = true, isDelete = false, isUndelete = isUndelete)
    .flatMap { edges =>
      tflock.insertEdges(edges, foregroundIndexingPriority).map { _ =>
        // Count all the edges we've successfully added:
        edges.foreach(edge => insertCounters(edge.graphId).incr())
      }
    }
|
||||
|
||||
// Collects the retweet-related edges of `tweet`. Edges known up front are added to `edges`.
// For deletes, a lookup that may yield one more edge is started and added to `futureEdges`.
private[this] def addRTEdges(
  tweet: Tweet,
  share: Share,
  isCreate: Boolean,
  edges: ListBuffer[ExecuteEdge[StatusGraph]],
  futureEdges: ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]]
): Unit = {
  // These three edges apply to creates and deletes alike.
  edges += RetweetsOfMeGraph.edge(share.sourceUserId, tweet.id)
  edges += RetweetsByMeGraph.edge(getUserId(tweet), tweet.id)
  edges += RetweetsGraph.edge(share.sourceStatusId, tweet.id)

  if (!isCreate) {
    edges += RetweetSourceGraph.edge(getUserId(tweet), share.sourceStatusId)

    // If this is the last retweet, the source tweet also has to be removed from the source
    // user's tweets-retweeted graph. The count is re-checked with a full select.
    val retweetsOfSource = RetweetsGraph.from(share.sourceStatusId)
    futureEdges.append(
      tflock.count(retweetsOfSource).flatMap { count =>
        if (count > 1) {
          FutureNil
        } else {
          tflock.selectAll(retweetsOfSource).map { tweets =>
            if (tweets.size > 1) Nil
            else Seq(TweetsRetweetedGraph.edge(share.sourceUserId, share.sourceStatusId))
          }
        }
      }
    )
  } else {
    // On create, the retweet-source edge is positioned by the retweet's snowflake time.
    edges += ExecuteEdge(
      sourceId = getUserId(tweet),
      graph = RetweetSourceGraph,
      destinationIds = Some(Seq(share.sourceStatusId)),
      direction = Forward,
      position = Some(SnowflakeId(tweet.id).time.inMillis)
    )
    edges += TweetsRetweetedGraph.edge(share.sourceUserId, share.sourceStatusId)
  }
}
|
||||
|
||||
// Adds the reply edge for a tweet that replies to another tweet and, if the tweet has a
// conversation id, its conversation edge.
private[this] def addReplyEdges(
  tweet: Tweet,
  edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit =
  for {
    reply <- getReply(tweet)
    inReplyToStatusId <- reply.inReplyToStatusId
  } {
    edges += RepliesToTweetsGraph.edge(inReplyToStatusId, tweet.id)

    // only index conversationId if this is a reply to another tweet
    TweetLenses.conversationId.get(tweet).foreach { conversationId =>
      edges += ConversationGraph.edge(conversationId, tweet.id)
    }
  }
|
||||
|
||||
// Adds a DirectedAtUserIdGraph edge when the tweet has a directed-at user.
private[this] def addDirectedAtEdges(
  tweet: Tweet,
  edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit =
  for (directedAtUser <- TweetLenses.directedAtUser.get(tweet)) {
    edges += DirectedAtUserIdGraph.edge(directedAtUser.userId, tweet.id)
  }
|
||||
|
||||
// Adds one MentionsGraph edge per mention that carries a user id.
private[this] def addMentionEdges(
  tweet: Tweet,
  edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit = {
  val mentionedUserIds = getMentions(tweet).flatMap(_.userId)
  edges ++= mentionedUserIds.map(userId => MentionsGraph.edge(userId, tweet.id))
}
|
||||
|
||||
// Collects the quote-tweet edges of `tweet`. Edges known up front are added to `edges`.
// For deletes, a lookup that may yield the QuotersGraph edge is added to `futureEdges`.
private[this] def addQTEdges(
  tweet: Tweet,
  edges: ListBuffer[ExecuteEdge[StatusGraph]],
  futureEdges: ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]],
  isCreate: Boolean
): Unit = {
  val userId = getUserId(tweet)

  for (quotedTweet <- tweet.quotedTweet) {
    // These two edges are added for both creates and deletes. The QuotersGraph edge is
    // handled differently for deletes (see below).
    edges += QuotesGraph.edge(quotedTweet.userId, tweet.id)
    edges += QuoteTweetsIndexGraph.edge(quotedTweet.tweetId, tweet.id)

    if (isCreate) {
      // On create, the QuotersGraph edge is added without any additional checks.
      edges += QuotersGraph.edge(quotedTweet.tweetId, userId)
    } else {
      // On delete, the QuotersGraph edge is only deleted if the tweeting user isn't quoting
      // the tweet anymore. If a user has quoted a tweet multiple times, the edge must stay
      // until they have deleted all the quotes, by definition of what the QuotersGraph
      // represents.
      //
      // Note: there is a potential race condition here.
      //   i) A quotes a tweet T twice, resulting in tweets T1 and T2.
      //  ii) There should be an edge T -> A in the QuotersGraph and edges T1 <-> T,
      //      T2 <-> T in the QuoteTweetsIndexGraph, but one of the latter hasn't been
      //      written to TFlock yet.
      // iii) In that scenario the edge should not be deleted, yet the code below deletes
      //      it. This is a "best effort" approach, similar to what is done for RTs.

      // All the quotes of the quoted tweet from the quoting user.
      val quotesFromQuotingUser = QuoteTweetsIndexGraph
        .from(quotedTweet.tweetId)
        .intersect(UserTimelineGraph.from(userId))

      futureEdges.append(
        tflock.count(quotesFromQuotingUser).flatMap { count =>
          if (count > 1) {
            FutureNil
          } else {
            // This looks like the last quote of the quoted tweet from the quoting user;
            // re-check with a full select before deleting the QuotersGraph edge.
            tflock.selectAll(quotesFromQuotingUser).map { tweets =>
              if (tweets.size > 1) Nil
              else Seq(QuotersGraph.edge(quotedTweet.tweetId, userId))
            }
          }
        }
      )
    }
  }
}
|
||||
|
||||
// Adds a CardTweetsGraph edge when the tweet's card reference is a "stored" card.
private[this] def addCardEdges(
  tweet: Tweet,
  edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit =
  // Note that we are indexing only the TOO "stored" cards
  // (cardUri=card://<cardId>). Rest of the cards are ignored here.
  tweet.cardReference match {
    case Some(StoredCard(id)) => edges.append(CardTweetsGraph.edge(id, tweet.id))
    case _ => ()
  }
|
||||
|
||||
// Adds the edge sets that are archived on delete and restored on undelete.
//
// Note: on undelete, this restores all archived edges, including those that may have been
// archived prior to the delete. This is incorrect behavior but in practice rarely causes
// problems, as undeletes are so rare.
private[this] def addEdgesForDeleteOrUndelete(
  tweet: Tweet,
  edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit = {
  edges += MentionsGraph.edges(tweet.id, None, Reverse)
  edges += RepliesToTweetsGraph.edges(tweet.id, None)

  // When we delete or undelete a conversation control root Tweet we want to archive or
  // restore all the edges in InvitedUsersGraph from the Tweet id.
  val isControlledRoot = hasConversationControl(tweet) && isConversationRoot(tweet)
  if (isControlledRoot) {
    edges += InvitedUsersGraph.edges(tweet.id, None)
  }
}
|
||||
|
||||
// Adds the timeline edges of a tweet. Nullcast tweets get only a nullcasted edge and
// narrowcast tweets only a narrowcast edge. Every other tweet goes on the user timeline,
// plus the media and creator-subscription timelines where they apply.
private[this] def addSimpleEdges(
  tweet: Tweet,
  edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit = {
  def authorId = getUserId(tweet)

  if (TweetLenses.nullcast.get(tweet)) {
    edges += NullcastedTweetsGraph.edge(authorId, tweet.id)
  } else if (TweetLenses.narrowcast.get(tweet).isDefined) {
    edges += NarrowcastSentTweetsGraph.edge(authorId, tweet.id)
  } else {
    edges += UserTimelineGraph.edge(authorId, tweet.id)

    if (hasMedia(tweet)) {
      edges += MediaTimelineGraph.edge(authorId, tweet.id)
    }

    // Index root creator subscription tweets.
    // Ignore replies because those are not necessarily visible to a user who subscribes to
    // the tweet author. A tweet without core data is treated as a root tweet.
    val isRootTweet: Boolean =
      tweet.coreData.forall(core => core.reply.isEmpty && core.share.isEmpty)

    if (tweet.exclusiveTweetControl.isDefined && isRootTweet) {
      edges += CreatorSubscriptionTimelineGraph.edge(authorId, tweet.id)

      if (hasMedia(tweet)) {
        edges += CreatorSubscriptionMediaTimelineGraph.edge(authorId, tweet.id)
      }
    }
  }
}
|
||||
|
||||
/**
 * Issues an InvitedUsersGraph edge for each mention of a user in a conversation-controlled
 * tweet, keyed by the tweet's conversation id (or the tweet's own id if it has none). This
 * way the graph accumulates the complete set of ids of @mention-invited users, by
 * conversation id.
 */
private def invitedUsersEdgesForCreate(
  tweet: Tweet,
  edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit = {
  val conversationId: Long = getConversationId(tweet).getOrElse(tweet.id)
  val invitedUserIds: Seq[UserId] = getMentions(tweet).flatMap(_.userId)
  edges ++= invitedUserIds.map(userId => InvitedUsersGraph.edge(conversationId, userId))
}
|
||||
|
||||
/**
 * Queues the InvitedUsersGraph edges that ought to be deleted for a conversation-controlled
 * reply.
 *
 * For every user mentioned in the given tweet, an edge is produced only if that user is not
 * mentioned elsewhere in the conversation. This way InvitedUsersGraph keeps holding the set of
 * all users invited to the conversation, and an edge goes away only after the last mention of a
 * user is deleted.
 *
 * Nothing is queued when the Tweet has no conversation id.
 */
private def invitedUsersEdgesForDelete(
  tweet: Tweet,
  futureEdges: ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]]
): Unit =
  for {
    conversationId <- getConversationId(tweet)
    userId <- getMentions(tweet).flatMap(_.userId)
  } {
    val tweetIdsWithinConversation = ConversationGraph.from(conversationId)
    val tweetIdsThatMentionUser = MentionsGraph.from(userId)

    val pendingEdges: Future[Seq[ExecuteEdge[StatusGraph]]] =
      tflock
        .selectAll(
          query = tweetIdsThatMentionUser.intersect(tweetIdsWithinConversation),
          limit = Some(2), // Only ">1 or <=1" matters, so two results are enough.
          pageSize = None // Provide default, otherwise Mockito complains
        ).map { mentioningTweetIds: Seq[Long] =>
          // More than one mentioning Tweet means the user stays invited after this delete.
          if (mentioningTweetIds.size > 1) Nil
          else Seq(InvitedUsersGraph.edge(conversationId, userId))
        }

    futureEdges.append(pendingEdges)
  }
|
||||
|
||||
/**
 * True when the Tweet's conversation control is one of ByInvitation, Community or Followers
 * and has `inviteViaMention` explicitly set to true. Any other (or absent) conversation
 * control, or an unset flag, yields false.
 */
private def hasInviteViaMention(tweet: Tweet): Boolean =
  tweet.conversationControl.exists {
    case ConversationControl.ByInvitation(byInvitation) =>
      byInvitation.inviteViaMention.contains(true)
    case ConversationControl.Community(community) =>
      community.inviteViaMention.contains(true)
    case ConversationControl.Followers(followers) =>
      followers.inviteViaMention.contains(true)
    case _ =>
      false
  }
|
||||
|
||||
/** True when the Tweet carries any conversation control setting. */
private def hasConversationControl(tweet: Tweet): Boolean = {
  tweet.conversationControl.nonEmpty
}
|
||||
|
||||
// If a Tweet has a ConversationControl, it must have a ConversationId associated with it so we
// can compare the ConversationId with the current Tweet ID to determine if it's the root of the
// conversation. See ConversationIdHydrator for more details.
//
// Should that expectation ever be violated, the Tweet is reported as a non-root instead of
// failing with the NoSuchElementException that Option.get would raise, so a single Tweet with a
// missing ConversationId cannot abort edge generation for all of its other indices.
private def isConversationRoot(tweet: Tweet): Boolean =
  getConversationId(tweet).contains(tweet.id)
|
||||
|
||||
/**
 * Adds (or queues) InvitedUsersGraph edges for a conversation-controlled Tweet. Tweets without
 * a conversation control are left untouched.
 *
 *  - create, root Tweet: edges are added for original creates only; undeletes are handled by
 *    addEdgesForDeleteOrUndelete.
 *  - create, reply: edges are added only when the conversation control is in inviteViaMention
 *    mode.
 *  - delete, reply: the edges to remove are resolved asynchronously into `futureEdges`.
 *  - delete, root Tweet: nothing to do here.
 */
private def addInvitedUsersEdges(
  tweet: Tweet,
  isCreate: Boolean,
  isUndelete: Boolean,
  edges: ListBuffer[ExecuteEdge[StatusGraph]],
  futureEdges: ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]]
): Unit = {
  if (hasConversationControl(tweet)) {
    val isRoot = isConversationRoot(tweet)

    (isCreate, isRoot) match {
      case (true, true) =>
        if (!isUndelete) invitedUsersEdgesForCreate(tweet, edges)
      case (true, false) =>
        if (hasInviteViaMention(tweet)) invitedUsersEdgesForCreate(tweet, edges)
      case (false, false) =>
        invitedUsersEdgesForDelete(tweet, futureEdges)
      case (false, true) =>
        ()
    }
  }
}
|
||||
|
||||
/**
 * Collects every edge that has to be executed for the given Tweet.
 *
 * Simple timeline edges are always generated. A retweet (a Tweet with a share) additionally
 * gets only its retweet edges; any other Tweet gets invited-users, reply, directed-at, mention,
 * quote-tweet and card edges, plus the delete/undelete edges when `isDelete` or `isUndelete`
 * is set.
 *
 * @return the synchronously generated edges followed by the asynchronously resolved ones
 */
private[this] def getEdges(
  tweet: Tweet,
  isCreate: Boolean,
  isDelete: Boolean,
  isUndelete: Boolean
): Future[Seq[ExecuteEdge[StatusGraph]]] = {
  val edges = ListBuffer.empty[ExecuteEdge[StatusGraph]]
  val futureEdges = ListBuffer.empty[Future[Seq[ExecuteEdge[StatusGraph]]]]

  addSimpleEdges(tweet, edges)

  getShare(tweet) match {
    case Some(share) =>
      addRTEdges(tweet, share, isCreate, edges, futureEdges)

    case _ =>
      addInvitedUsersEdges(tweet, isCreate, isUndelete, edges, futureEdges)
      addReplyEdges(tweet, edges)
      addDirectedAtEdges(tweet, edges)
      addMentionEdges(tweet, edges)
      addQTEdges(tweet, edges, futureEdges, isCreate)
      addCardEdges(tweet, edges)

      val isDeleteOrUndelete = isDelete || isUndelete
      if (isDeleteOrUndelete) {
        addEdgesForDeleteOrUndelete(tweet, edges)
      }
  }

  // Once all pending lookups finish, fold their edges into the buffer and snapshot it.
  Future.collect(futureEdges).map { resolvedEdges =>
    edges ++= resolvedEdges.flatten
    edges.toList
  }
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
/** Copyright 2010 Twitter, Inc. */
|
||||
package com.twitter.tweetypie
|
||||
package tflock
|
||||
|
||||
import com.twitter.tweetypie.thriftscala.Tweet
|
||||
import com.twitter.util.Future
|
||||
|
||||
/**
 * Hooks for maintaining the indices of a Tweet across its lifecycle. Every hook has a default
 * implementation that does nothing and returns `Future.Unit`.
 */
trait TweetIndexer {

  /**
   * Called at tweet-creation time, this method should set up all relevant indices on the tweet.
   */
  def createIndex(tweet: Tweet): Future[Unit] = {
    Future.Unit
  }

  /**
   * Called at tweet-undelete time (which isn't yet handled), this method should
   * restore all relevant indices on the tweet.
   */
  def undeleteIndex(tweet: Tweet): Future[Unit] = {
    Future.Unit
  }

  /**
   * Called at tweet-delete time, this method should archive all relevant indices on the tweet.
   */
  def deleteIndex(tweet: Tweet, isBounceDelete: Boolean): Future[Unit] = {
    Future.Unit
  }

  /**
   * This method should archive or unarchive the retweet edge in TFlock RetweetsGraph.
   */
  def setRetweetVisibility(retweetId: TweetId, visible: Boolean): Future[Unit] = {
    Future.Unit
  }
}
|
|
@ -0,0 +1,13 @@
|
|||
# Scala library built from every *.scala source in this directory, with warnings fatal.
scala_library(
    sources = ["*.scala"],
    compiler_option_sets = ["fatal_warnings"],
    platform = "java8",
    strict_deps = True,
    tags = ["bazel-compatible"],
    dependencies = [
        "finagle/finagle-core/src/main",
        "scrooge/scrooge-core/src/main/scala",
        "tweetypie/common/src/thrift/com/twitter/tweetypie:service-scala",
        "util/util-core:scala",
    ],
)
|
|
@ -0,0 +1,8 @@
|
|||
package com.twitter.tweetypie.thriftscala
|
||||
|
||||
import com.twitter.finagle.service.FailedService
|
||||
|
||||
/**
 * A TweetService finagle client constructed over a `FailedService` that carries an
 * `UnsupportedOperationException("not implemented")`.
 */
class NotImplementedTweetService
    extends TweetService$FinagleClient(
      new FailedService(new UnsupportedOperationException("not implemented"))
    )
|
|
@ -0,0 +1,79 @@
|
|||
package com.twitter.tweetypie.thriftscala
|
||||
|
||||
import com.twitter.util.Future
|
||||
|
||||
/**
 * Base trait for TweetService implementations that decorate another TweetService and only need
 * to change a few endpoints. Every endpoint forwards to `underlying` through `wrap`.
 */
trait TweetServiceProxy extends TweetService.MethodPerEndpoint {

  /** The TweetService every call is forwarded to. */
  protected def underlying: TweetService.MethodPerEndpoint

  /**
   * Hook applied around each forwarded call. The default implementation simply passes the
   * Future through; override it to add logic around every invocation of the underlying
   * TweetService. The argument is by-name, so the underlying call is made only when `f` is
   * evaluated.
   */
  protected def wrap[A](f: => Future[A]): Future[A] = {
    f
  }

  override def getTweets(request: GetTweetsRequest): Future[Seq[GetTweetResult]] =
    wrap { underlying.getTweets(request) }

  override def getTweetFields(request: GetTweetFieldsRequest): Future[Seq[GetTweetFieldsResult]] =
    wrap { underlying.getTweetFields(request) }

  override def getTweetCounts(request: GetTweetCountsRequest): Future[Seq[GetTweetCountsResult]] =
    wrap { underlying.getTweetCounts(request) }

  override def setAdditionalFields(request: SetAdditionalFieldsRequest): Future[Unit] =
    wrap { underlying.setAdditionalFields(request) }

  override def deleteAdditionalFields(request: DeleteAdditionalFieldsRequest): Future[Unit] =
    wrap { underlying.deleteAdditionalFields(request) }

  override def postTweet(request: PostTweetRequest): Future[PostTweetResult] =
    wrap { underlying.postTweet(request) }

  override def postRetweet(request: RetweetRequest): Future[PostTweetResult] =
    wrap { underlying.postRetweet(request) }

  override def unretweet(request: UnretweetRequest): Future[UnretweetResult] =
    wrap { underlying.unretweet(request) }

  override def getDeletedTweets(
    request: GetDeletedTweetsRequest
  ): Future[Seq[GetDeletedTweetResult]] =
    wrap { underlying.getDeletedTweets(request) }

  override def deleteTweets(request: DeleteTweetsRequest): Future[Seq[DeleteTweetResult]] =
    wrap { underlying.deleteTweets(request) }

  override def updatePossiblySensitiveTweet(
    request: UpdatePossiblySensitiveTweetRequest
  ): Future[Unit] =
    wrap { underlying.updatePossiblySensitiveTweet(request) }

  override def undeleteTweet(request: UndeleteTweetRequest): Future[UndeleteTweetResponse] =
    wrap { underlying.undeleteTweet(request) }

  override def eraseUserTweets(request: EraseUserTweetsRequest): Future[Unit] =
    wrap { underlying.eraseUserTweets(request) }

  override def incrTweetFavCount(request: IncrTweetFavCountRequest): Future[Unit] =
    wrap { underlying.incrTweetFavCount(request) }

  override def deleteLocationData(request: DeleteLocationDataRequest): Future[Unit] =
    wrap { underlying.deleteLocationData(request) }

  override def scrubGeo(request: GeoScrub): Future[Unit] =
    wrap { underlying.scrubGeo(request) }

  override def takedown(request: TakedownRequest): Future[Unit] =
    wrap { underlying.takedown(request) }

  override def flush(request: FlushRequest): Future[Unit] =
    wrap { underlying.flush(request) }

  override def incrTweetBookmarkCount(request: IncrTweetBookmarkCountRequest): Future[Unit] =
    wrap { underlying.incrTweetBookmarkCount(request) }
}
|
|
@ -0,0 +1,15 @@
|
|||
# Scala library built from every *.scala source in this directory, with warnings fatal.
scala_library(
    sources = ["*.scala"],
    compiler_option_sets = ["fatal_warnings"],
    strict_deps = True,
    tags = ["bazel-compatible"],
    dependencies = [
        "tweetypie/servo/util",
        "tweetypie/common/src/thrift/com/twitter/tweetypie:media-entity-scala",
        "tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
        "tco-util",
        "tweetypie/common/src/scala/com/twitter/tweetypie/tweettext",
        "tweetypie/common/src/scala/com/twitter/tweetypie/util",
        "twitter-text/lib/java/src/main/java/com/twitter/twittertext",
    ],
)
|
|
@ -0,0 +1,11 @@
|
|||
package com.twitter.tweetypie.thriftscala.entities
|
||||
|
||||
import com.twitter.tweetypie.thriftscala.CashtagEntity
|
||||
import com.twitter.tweetypie.tweettext.TextEntity
|
||||
|
||||
/** [[TextEntity]] instance that exposes and relocates the index range of a [[CashtagEntity]]. */
object CashtagTextEntity extends TextEntity[CashtagEntity] {
  override def fromIndex(entity: CashtagEntity): Short = {
    entity.fromIndex
  }

  override def toIndex(entity: CashtagEntity): Short = {
    entity.toIndex
  }

  /** Returns a copy of `entity` whose index range is replaced; all other fields are kept. */
  override def move(entity: CashtagEntity, fromIndex: Short, toIndex: Short): CashtagEntity = {
    entity.copy(fromIndex = fromIndex, toIndex = toIndex)
  }
}
|
|
@ -0,0 +1,118 @@
|
|||
package com.twitter.tweetypie.thriftscala.entities
|
||||
|
||||
import com.twitter.servo.data.Mutation
|
||||
import com.twitter.tco_util.TcoUrl
|
||||
import com.twitter.tweetypie.thriftscala._
|
||||
import com.twitter.tweetypie.thriftscala.entities.Implicits._
|
||||
import com.twitter.tweetypie.tweettext.PartialHtmlEncoding
|
||||
import com.twitter.tweetypie.tweettext.TextEntity
|
||||
import com.twitter.tweetypie.tweettext.TextModification
|
||||
import com.twitter.tweetypie.util.TweetLenses
|
||||
import com.twitter.twittertext.Extractor
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Functions that collect urls, mentions, hashtags, and cashtags from the text of tweets and
 * messages.
 */
object EntityExtractor {
  // Only one configuration of com.twitter.twittertext.Extractor is ever used, so a single
  // shared instance is fine. The only available configuration option is whether to extract
  // URLs without protocols (defaults to true).
  private[this] val extractor = new Extractor

  // The twitter-text library operates on unencoded text, but we store and process HTML-encoded
  // text. The TextModification returned from this function contains the decoded text which we
  // will operate on, but also lets us map the indices of the twitter-text entities back to the
  // entities on the encoded text.
  private val htmlEncodedTextToEncodeModification: String => TextModification = { text =>
    val decoding = PartialHtmlEncoding.decodeWithModification(text)
    decoding.getOrElse(TextModification.identity(text)).inverse
  }

  private[this] val extractAllUrlsFromTextMod: TextModification => Seq[UrlEntity] =
    extractUrls(tcoOnly = false)

  /** Extracts every URL entity from HTML-encoded text. */
  val extractAllUrls: String => Seq[UrlEntity] =
    text => extractAllUrlsFromTextMod(htmlEncodedTextToEncodeModification(text))

  private[this] val extractTcoUrls: TextModification => Seq[UrlEntity] =
    extractUrls(tcoOnly = true)

  // Builds a URL extractor; with `tcoOnly` set, URLs that are not t.co URLs are dropped.
  private[this] def extractUrls(tcoOnly: Boolean): TextModification => Seq[UrlEntity] =
    mkEntityExtractor[UrlEntity](
      text =>
        extractor
          .extractURLsWithIndices(text)
          .asScala
          .filter(found => !tcoOnly || TcoUrl.isTcoUrl(found.getValue)),
      UrlEntity(_, _, _)
    )

  private[this] val extractMentionsFromTextMod: TextModification => Seq[MentionEntity] =
    mkEntityExtractor[MentionEntity](
      text => extractor.extractMentionedScreennamesWithIndices(text).asScala,
      MentionEntity(_, _, _)
    )

  /** Extracts every mention entity from HTML-encoded text. */
  val extractMentions: String => Seq[MentionEntity] =
    text => extractMentionsFromTextMod(htmlEncodedTextToEncodeModification(text))

  private[this] val extractHashtagsFromTextMod: TextModification => Seq[HashtagEntity] =
    mkEntityExtractor[HashtagEntity](
      text => extractor.extractHashtagsWithIndices(text).asScala,
      HashtagEntity(_, _, _)
    )

  /** Extracts every hashtag entity from HTML-encoded text. */
  val extractHashtags: String => Seq[HashtagEntity] =
    text => extractHashtagsFromTextMod(htmlEncodedTextToEncodeModification(text))

  private[this] val extractCashtagsFromTextMod: TextModification => Seq[CashtagEntity] =
    mkEntityExtractor[CashtagEntity](
      text => extractor.extractCashtagsWithIndices(text).asScala,
      CashtagEntity(_, _, _)
    )

  /** Extracts every cashtag entity from HTML-encoded text. */
  val extractCashtags: String => Seq[CashtagEntity] =
    text => extractCashtagsFromTextMod(htmlEncodedTextToEncodeModification(text))

  // Generic extraction pipeline: run `extract` over the modification's `original` text, let
  // twitter-text rewrite the found indices from UTF-16 to Unicode, build an entity with
  // `construct`, and re-index it through the TextModification. Entities whose indices do not
  // fit in a Short, whose value is null, or that cannot be re-indexed are dropped.
  private[this] def mkEntityExtractor[E: TextEntity](
    extract: String => Seq[Extractor.Entity],
    construct: (Short, Short, String) => E
  ): TextModification => Seq[E] = { htmlEncodedMod =>
    def convert(found: Extractor.Entity): Option[E] =
      for {
        start <- asShort(found.getStart.intValue)
        end <- asShort(found.getEnd.intValue)
        if found.getValue != null
        res <- htmlEncodedMod.reindexEntity(construct(start, end, found.getValue))
      } yield res

    val sourceText = htmlEncodedMod.original
    val entities = extract(sourceText)
    // Mutates the indices of the extracted entities in place.
    extractor.modifyIndicesFromUTF16ToUnicode(sourceText, entities.asJava)
    entities.flatMap(found => convert(found))
  }

  // Narrows an Int to a Short, yielding None when the value does not fit.
  private[this] def asShort(i: Int): Option[Short] =
    if (!i.isValidShort) None else Some(i.toShort)

  // Mutation that replaces the Tweet's mentions, hashtags and cashtags with freshly extracted
  // ones, and its urls with freshly extracted t.co urls when `extractUrls` is set.
  private[this] def mutation(extractUrls: Boolean): Mutation[Tweet] =
    Mutation { tweet =>
      val htmlEncodedMod = htmlEncodedTextToEncodeModification(TweetLenses.text.get(tweet))
      val urls = if (extractUrls) Some(extractTcoUrls(htmlEncodedMod)) else tweet.urls

      val reextracted = tweet.copy(
        urls = urls,
        mentions = Some(extractMentionsFromTextMod(htmlEncodedMod)),
        hashtags = Some(extractHashtagsFromTextMod(htmlEncodedMod)),
        cashtags = Some(extractCashtagsFromTextMod(htmlEncodedMod))
      )

      Some(reextracted)
    }

  val mutationWithoutUrls: Mutation[Tweet] = mutation(extractUrls = false)
  val mutationAll: Mutation[Tweet] = mutation(extractUrls = true)
}
|
|
@ -0,0 +1,11 @@
|
|||
package com.twitter.tweetypie.thriftscala.entities
|
||||
|
||||
import com.twitter.tweetypie.thriftscala.HashtagEntity
|
||||
import com.twitter.tweetypie.tweettext.TextEntity
|
||||
|
||||
/** [[TextEntity]] instance that exposes and relocates the index range of a [[HashtagEntity]]. */
object HashtagTextEntity extends TextEntity[HashtagEntity] {
  override def fromIndex(entity: HashtagEntity): Short = {
    entity.fromIndex
  }

  override def toIndex(entity: HashtagEntity): Short = {
    entity.toIndex
  }

  /** Returns a copy of `entity` whose index range is replaced; all other fields are kept. */
  override def move(entity: HashtagEntity, fromIndex: Short, toIndex: Short): HashtagEntity = {
    entity.copy(fromIndex = fromIndex, toIndex = toIndex)
  }
}
|
|
@ -0,0 +1,10 @@
|
|||
package com.twitter.tweetypie.thriftscala.entities
|
||||
|
||||
/**
 * Implicit [[com.twitter.tweetypie.tweettext.TextEntity]] instances for the thrift entity
 * types. Import `Implicits._` to bring all of them into implicit scope.
 */
object Implicits {
  implicit val cashtagTextEntity: CashtagTextEntity.type = CashtagTextEntity
  implicit val hashtagTextEntity: HashtagTextEntity.type = HashtagTextEntity
  implicit val mediaTextEntity: MediaTextEntity.type = MediaTextEntity
  implicit val mentionTextEntity: MentionTextEntity.type = MentionTextEntity
  implicit val textRangeTextEntity: TextRangeEntityAdapter.type = TextRangeEntityAdapter
  implicit val urlTextEntity: UrlTextEntity.type = UrlTextEntity
}
|
|
@ -0,0 +1,11 @@
|
|||
package com.twitter.tweetypie.thriftscala.entities
|
||||
|
||||
import com.twitter.tweetypie.thriftscala.MediaEntity
|
||||
import com.twitter.tweetypie.tweettext.TextEntity
|
||||
|
||||
/** [[TextEntity]] instance that exposes and relocates the index range of a [[MediaEntity]]. */
object MediaTextEntity extends TextEntity[MediaEntity] {
  override def fromIndex(entity: MediaEntity): Short = {
    entity.fromIndex
  }

  override def toIndex(entity: MediaEntity): Short = {
    entity.toIndex
  }

  /** Returns a copy of `entity` whose index range is replaced; all other fields are kept. */
  override def move(entity: MediaEntity, fromIndex: Short, toIndex: Short): MediaEntity = {
    entity.copy(fromIndex = fromIndex, toIndex = toIndex)
  }
}
|
|
@ -0,0 +1,11 @@
|
|||
package com.twitter.tweetypie.thriftscala.entities
|
||||
|
||||
import com.twitter.tweetypie.thriftscala.MentionEntity
|
||||
import com.twitter.tweetypie.tweettext.TextEntity
|
||||
|
||||
/** [[TextEntity]] instance that exposes and relocates the index range of a [[MentionEntity]]. */
object MentionTextEntity extends TextEntity[MentionEntity] {
  override def fromIndex(entity: MentionEntity): Short = {
    entity.fromIndex
  }

  override def toIndex(entity: MentionEntity): Short = {
    entity.toIndex
  }

  /** Returns a copy of `entity` whose index range is replaced; all other fields are kept. */
  override def move(entity: MentionEntity, fromIndex: Short, toIndex: Short): MentionEntity = {
    entity.copy(fromIndex = fromIndex, toIndex = toIndex)
  }
}
|
|
@ -0,0 +1,11 @@
|
|||
package com.twitter.tweetypie.thriftscala.entities
|
||||
|
||||
import com.twitter.tweetypie.thriftscala.TextRange
|
||||
import com.twitter.tweetypie.tweettext.TextEntity
|
||||
|
||||
/**
 * [[TextEntity]] instance for [[TextRange]]. The range's indices are narrowed with `toShort`
 * when read.
 */
object TextRangeEntityAdapter extends TextEntity[TextRange] {
  override def fromIndex(entity: TextRange): Short = {
    entity.fromIndex.toShort
  }

  override def toIndex(entity: TextRange): Short = {
    entity.toIndex.toShort
  }

  /** Returns a copy of `entity` whose index range is replaced; all other fields are kept. */
  override def move(entity: TextRange, fromIndex: Short, toIndex: Short): TextRange = {
    entity.copy(fromIndex = fromIndex, toIndex = toIndex)
  }
}
|
|
@ -0,0 +1,11 @@
|
|||
package com.twitter.tweetypie.thriftscala.entities
|
||||
|
||||
import com.twitter.tweetypie.thriftscala.UrlEntity
|
||||
import com.twitter.tweetypie.tweettext.TextEntity
|
||||
|
||||
/** [[TextEntity]] instance that exposes and relocates the index range of a [[UrlEntity]]. */
object UrlTextEntity extends TextEntity[UrlEntity] {
  override def fromIndex(entity: UrlEntity): Short = {
    entity.fromIndex
  }

  override def toIndex(entity: UrlEntity): Short = {
    entity.toIndex
  }

  /** Returns a copy of `entity` whose index range is replaced; all other fields are kept. */
  override def move(entity: UrlEntity, fromIndex: Short, toIndex: Short): UrlEntity = {
    entity.copy(fromIndex = fromIndex, toIndex = toIndex)
  }
}
|
|
@ -0,0 +1,16 @@
|
|||
# Scala library built from every *.scala source in this directory and published as the
# com.twitter "tweetypie-tweettext" artifact.
scala_library(
    sources = ["*.scala"],
    compiler_option_sets = ["fatal_warnings"],
    platform = "java8",
    provides = scala_artifact(
        org = "com.twitter",
        name = "tweetypie-tweettext",
        repo = artifactory,
    ),
    strict_deps = True,
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/ibm/icu:icu4j",
        "twitter-text/lib/java/src/main/java/com/twitter/twittertext",
    ],
)
|
|
@ -0,0 +1,44 @@
|
|||
package com.twitter.tweetypie.tweettext
|
||||
|
||||
import com.ibm.icu.text.BreakIterator
|
||||
|
||||
/**
 * Adapts the [[BreakIterator]] interface to a scala [[Iterator]]
 * over the offsets of user-perceived characters in a String.
 */
object GraphemeIndexIterator {

  /**
   * Produce an iterator over indices in the string that mark the end
   * of a user-perceived character (grapheme).
   *
   * Every grapheme start except the first is also the end of the previous grapheme, and the
   * final end is the length of the string.
   */
  def ends(s: String): Iterator[Offset.CodeUnit] = {
    val interiorBoundaries = starts(s).drop(1)
    interiorBoundaries ++ Iterator.single(Offset.CodeUnit.length(s))
  }

  /**
   * Produce an iterator over indices in the string that mark the start
   * of a user-perceived character (grapheme).
   *
   * Calling `next` on the exhausted iterator throws an IllegalArgumentException.
   */
  def starts(s: String): Iterator[Offset.CodeUnit] = {
    val breaks = BreakIterator.getCharacterInstance()
    breaks.setText(s)

    new Iterator[Offset.CodeUnit] {
      override def hasNext: Boolean = breaks.current() < s.length

      override def next: Offset.CodeUnit = {
        if (!hasNext) throw new IllegalArgumentException(s"${breaks.current()}, ${s.length}")

        // The current boundary is the start of the grapheme being returned; capture it before
        // advancing the break iterator to the following boundary.
        val graphemeStart = Offset.CodeUnit(breaks.current())
        breaks.next()
        graphemeStart
      }
    }
  }
}
|
|
@ -0,0 +1,85 @@
|
|||
package com.twitter.tweetypie.tweettext
|
||||
|
||||
/**
 * An efficient converter of indices between code points and code units.
 *
 * The converter remembers the most recently converted position as a matching pair of code unit
 * and code point offsets, so a conversion only has to count the text between that position and
 * the requested one. Because of this cached cursor every conversion mutates the instance; no
 * synchronization is performed here.
 */
class IndexConverter(text: String) {
  // Keep track of a single corresponding pair of code unit and code point
  // offsets so that we can re-use counting work if the next requested
  // entity is near the most recent entity.
  private var codePointIndex = 0
  // The code unit index should never split a surrogate pair.
  private var charIndex = 0

  /**
   * @param offset Index into the string measured in code units.
   * @return The code point index that corresponds to the specified character index.
   */
  def toCodePoints(offset: Offset.CodeUnit): Offset.CodePoint =
    Offset.CodePoint(codeUnitsToCodePoints(offset.toInt))

  /**
   * Converts a code unit index into a code point index.
   *
   * An index that points at the second code unit of a surrogate pair is treated as pointing at
   * the start of that pair, regardless of whether the cached cursor lies before or after it.
   *
   * @param charIndex Index into the string measured in code units.
   * @return The code point index that corresponds to the specified character index.
   * @throws IndexOutOfBoundsException if `charIndex` is negative or larger than the length of
   *                                   the text (raised by `String.codePointCount`).
   */
  def codeUnitsToCodePoints(charIndex: Int): Int = {
    // Snap the target to the start of its surrogate pair BEFORE counting. Counting first and
    // correcting afterwards is only right when moving forward: when moving backward, the lone
    // low surrogate at the start of the counted range already counts as one code point, so a
    // further "-1" correction would leave the cached cursor off by one for all later calls.
    val splitsSurrogatePair =
      charIndex > 0 &&
        charIndex < text.length &&
        Character.isSupplementaryCodePoint(text.codePointAt(charIndex - 1))
    val target = if (splitsSurrogatePair) charIndex - 1 else charIndex

    if (target < this.charIndex) {
      this.codePointIndex -= text.codePointCount(target, this.charIndex)
    } else {
      this.codePointIndex += text.codePointCount(this.charIndex, target)
    }
    this.charIndex = target

    this.codePointIndex
  }

  /**
   * @param offset Index into the string measured in code points.
   * @return the corresponding code unit index
   */
  def toCodeUnits(offset: Offset.CodePoint): Offset.CodeUnit = {
    // Walk from the cached cursor by the signed distance (in code points) to the target.
    this.charIndex = text.offsetByCodePoints(charIndex, offset.toInt - this.codePointIndex)
    this.codePointIndex = offset.toInt
    Offset.CodeUnit(this.charIndex)
  }

  /**
   * @param codePointIndex Index into the string measured in code points.
   * @return the corresponding code unit index
   */
  def codePointsToCodeUnits(codePointIndex: Int): Int =
    toCodeUnits(Offset.CodePoint(codePointIndex)).toInt

  /**
   * Returns a substring which begins at the specified code point `from` and extends to the
   * code point `to`. Since String.substring only works with character, the method first
   * converts code point offset to code unit offset.
   */
  def substring(from: Offset.CodePoint, to: Offset.CodePoint): String =
    text.substring(toCodeUnits(from).toInt, toCodeUnits(to).toInt)

  /**
   * Returns a substring which begins at the specified code point `from` and extends to the
   * code point `to`. Since String.substring only works with character, the method first
   * converts code point offset to code unit offset.
   */
  def substringByCodePoints(from: Int, to: Int): String =
    substring(Offset.CodePoint(from), Offset.CodePoint(to))

  /**
   * Returns a substring which begins at the specified code point `from` and extends to the
   * end of the string. Since String.substring only works with character, the method first
   * converts code point offset to code unit offset.
   */
  def substringByCodePoints(from: Int): String = {
    val charFrom = codePointsToCodeUnits(from)
    text.substring(charFrom)
  }
}
|
|
@ -0,0 +1,253 @@
|
|||
package com.twitter.tweetypie.tweettext
|
||||
import scala.collection.immutable
|
||||
|
||||
/**
 * An Offset is a typed index into a String.
 *
 * Implementations say how to turn the offset into a plain Int and how to measure a range of
 * text in their unit; ordering and whole-string length are derived from those two operations.
 */
trait Offset[T] extends Ordering[T] {

  /** The raw integer value of the offset. */
  def toInt(t: T): Int

  /** Measures the text between the code unit offsets `start` and `end` in this unit. */
  def count(text: String, start: Offset.CodeUnit, end: Offset.CodeUnit): T

  /** Orders offsets by their integer value. */
  def compare(t1: T, t2: T): Int = Integer.compare(toInt(t1), toInt(t2))

  /** Measures the whole of `input` in this unit. */
  def length(input: String): T = {
    val wholeStart = Offset.CodeUnit(0)
    val wholeEnd = Offset.CodeUnit.length(input)
    count(input, wholeStart, wholeEnd)
  }
}
|
||||
|
||||
object Offset {
|
||||
|
||||
/**
 * UTF-16 code unit offsets are the native offsets for Java/Scala
 * Strings.
 */
case class CodeUnit(toInt: Int) extends AnyVal with Ordered[CodeUnit] {
  def compare(other: CodeUnit): Int = Integer.compare(toInt, other.toInt)

  def +(other: CodeUnit): CodeUnit = CodeUnit(toInt + other.toInt)
  def -(other: CodeUnit): CodeUnit = CodeUnit(toInt - other.toInt)

  def min(other: CodeUnit): CodeUnit = CodeUnit(math.min(toInt, other.toInt))
  def max(other: CodeUnit): CodeUnit = CodeUnit(math.max(toInt, other.toInt))

  /** The offset one code unit further into the string. */
  def incr: CodeUnit = this + CodeUnit(1)

  /** The offset one code unit earlier in the string. */
  def decr: CodeUnit = this - CodeUnit(1)

  /** Every offset from this one (inclusive) up to `end` (exclusive). */
  def until(end: CodeUnit): immutable.IndexedSeq[CodeUnit] =
    (toInt until end.toInt).map(CodeUnit(_))

  /**
   * Converts this `CodeUnit` to the equivalent `CodePoint` within the
   * given text.
   */
  def toCodePoint(text: String): CodePoint = {
    val codePointsBefore = text.codePointCount(0, toInt)
    CodePoint(codePointsBefore)
  }

  /** The offset reached by moving `codePoints` code points away from this one in `text`. */
  def offsetByCodePoints(text: String, codePoints: CodePoint): CodeUnit = {
    val moved = text.offsetByCodePoints(toInt, codePoints.toInt)
    CodeUnit(moved)
  }
}
|
||||
|
||||
/** [[Offset]] instance for UTF-16 code units, the unit Strings are natively indexed in. */
implicit object CodeUnit extends Offset[CodeUnit] {
  def toInt(u: CodeUnit): Int = u.toInt

  /** The length of `text` in code units, i.e. `text.length`. */
  override def length(text: String): CodeUnit = CodeUnit(text.length)

  /** The distance between the two offsets; the text itself is not inspected. */
  def count(text: String, start: CodeUnit, end: CodeUnit): CodeUnit =
    CodeUnit(end.toInt - start.toInt)
}
|
||||
|
||||
/**
 * Offsets in whole Unicode code points. Any CodePoint is a valid
 * offset into the String as long as it is >= 0 and less than the
 * number of code points in the string.
 */
case class CodePoint(toInt: Int) extends AnyVal with Ordered[CodePoint] {
  /** The offset narrowed to a Short via `Int.toShort`. */
  def toShort: Short = toInt.toShort

  def compare(other: CodePoint): Int = Integer.compare(toInt, other.toInt)

  def +(other: CodePoint): CodePoint = CodePoint(toInt + other.toInt)
  def -(other: CodePoint): CodePoint = CodePoint(toInt - other.toInt)

  def min(other: CodePoint): CodePoint = CodePoint(math.min(toInt, other.toInt))
  def max(other: CodePoint): CodePoint = CodePoint(math.max(toInt, other.toInt))

  /** Every offset from this one (inclusive) up to `end` (exclusive). */
  def until(end: CodePoint): immutable.IndexedSeq[CodePoint] =
    (toInt until end.toInt).map(CodePoint(_))

  /** Converts this `CodePoint` to the equivalent `CodeUnit` within the given text. */
  def toCodeUnit(text: String): CodeUnit = {
    val codeUnitIndex = text.offsetByCodePoints(0, toInt)
    CodeUnit(codeUnitIndex)
  }
}
|
||||
|
||||
/** [[Offset]] instance that measures text in whole Unicode code points. */
implicit object CodePoint extends Offset[CodePoint] {
  def toInt(p: CodePoint): Int = p.toInt

  /** The number of code points in `text` between the code unit offsets `start` and `end`. */
  def count(text: String, start: CodeUnit, end: CodeUnit): CodePoint = {
    val codePoints = text.codePointCount(start.toInt, end.toInt)
    CodePoint(codePoints)
  }
}
|
||||
|
||||
/**
 * Offsets into the String as if the String were encoded as UTF-8. You
 * cannot use a [[Utf8]] offset to index a String, because not all
 * Utf8 indices are valid indices into the String.
 */
case class Utf8(toInt: Int) extends AnyVal with Ordered[Utf8] {
  def compare(other: Utf8): Int = Integer.compare(toInt, other.toInt)

  def +(other: Utf8): Utf8 = Utf8(toInt + other.toInt)
  def -(other: Utf8): Utf8 = Utf8(toInt - other.toInt)

  def min(other: Utf8): Utf8 = Utf8(math.min(toInt, other.toInt))
  def max(other: Utf8): Utf8 = Utf8(math.max(toInt, other.toInt))
}
|
||||
|
||||
/** [[Offset]] instance that measures text in the bytes of its UTF-8 encoding. */
implicit object Utf8 extends Offset[Utf8] {
  def toInt(u: Utf8): Int = u.toInt

  /**
   * Count how many bytes this section of text would be when encoded as
   * UTF-8.
   */
  def count(s: String, start: CodeUnit, end: CodeUnit): Utf8 = {
    // Walks the range one code point at a time, summing the encoded size of each.
    @annotation.tailrec
    def go(i: CodeUnit, byteLength: Utf8): Utf8 =
      if (i >= end) byteLength
      else {
        val cp = s.codePointAt(i.toInt)
        go(i + CodeUnit(Character.charCount(cp)), byteLength + forCodePoint(cp))
      }

    go(start, Utf8(0))
  }

  /**
   * Unfortunately, there is no convenient API for finding out how many
   * bytes a unicode code point would take in UTF-8, so we have to
   * explicitly calculate it.
   *
   * @see http://en.wikipedia.org/wiki/UTF-8#Description
   */
  def forCodePoint(cp: Int): Utf8 = {
    val byteCount =
      // if the code point is an unpaired surrogate, it will be converted
      // into a 1 byte replacement character
      if (Character.getType(cp) == Character.SURROGATE) 1
      else if (cp < 0x80) 1
      else if (cp < 0x800) 2
      else if (cp < 0x10000) 3
      else 4

    Utf8(byteCount)
  }
}
|
||||
|
||||
/**
|
||||
* Display units count what we consider a "character" in a
|
||||
* Tweet. [[DisplayUnit]] offsets are only valid for text that is
|
||||
* NFC-normalized (See: http://www.unicode.org/reports/tr15) and
|
||||
* HTML-encoded, though this interface cannot enforce that.
|
||||
*
|
||||
* Currently, a [[DisplayUnit]] is equivalent to a single Unicode code
|
||||
* point combined with treating "&lt;", "&gt;", and "&amp;" each as a
|
||||
* single character (since they are displayed as '<', '>', and '&'
|
||||
* respectively). This implementation is not directly exposed.
|
||||
*
|
||||
* It should be possible to change this definition without breaking
|
||||
* code that uses the [[DisplayUnit]] interface e.g. to count
|
||||
* user-perceived characters (graphemes) rather than code points,
|
||||
* though any change has to be made in concert with changing the
|
||||
* mobile client and Web implementations so that the user experience
|
||||
* of character counting remains consistent.
|
||||
*/
|
||||
/** An offset (or length) measured in display units; a thin, ordered wrapper around an Int. */
case class DisplayUnit(toInt: Int) extends AnyVal with Ordered[DisplayUnit] {
  def compare(other: DisplayUnit): Int = Integer.compare(toInt, other.toInt)

  def +(other: DisplayUnit): DisplayUnit = DisplayUnit(toInt + other.toInt)
  def -(other: DisplayUnit): DisplayUnit = DisplayUnit(toInt - other.toInt)

  def min(other: DisplayUnit): DisplayUnit = DisplayUnit(math.min(toInt, other.toInt))
  def max(other: DisplayUnit): DisplayUnit = DisplayUnit(math.max(toInt, other.toInt))
}
|
||||
|
||||
implicit object DisplayUnit extends Offset[DisplayUnit] {
  def toInt(d: DisplayUnit): Int = d.toInt

  /**
   * Returns the number of display units in the specified range of the
   * given text. See [[DisplayUnit]] for a description of what we
   * consider a display unit.
   *
   * The input string should already be NFC normalized to get
   * consistent results. If partially html encoded, it will correctly
   * count html entities as a single display unit.
   *
   * @param text the string containing the characters to count.
   * @param start the index to the first char of the text range
   * @param end the index after the last char of the text range.
   */
  def count(text: String, start: CodeUnit, end: CodeUnit): DisplayUnit = {
    // clamp the requested end so we never step past the end of the text
    val stop = end.min(CodeUnit.length(text))

    @annotation.tailrec
    def go(offset: CodeUnit, total: DisplayUnit): DisplayUnit =
      if (offset >= stop) total
      else go(offset + at(text, offset), total + DisplayUnit(1))

    go(start, DisplayUnit(0))
  }

  /**
   * Return the length (in code units) of the display unit at the
   * specified offset in the (NFC-normalized, HTML-encoded) text.
   *
   * The three partially-encoded entities "&amp;", "&lt;", and "&gt;"
   * each count as one display unit spanning the whole entity; a bare
   * '&' that does not start one of them spans a single code unit.
   */
  def at(text: String, offset: CodeUnit): CodeUnit =
    CodeUnit {
      text.codePointAt(offset.toInt) match {
        case '&' =>
          if (text.regionMatches(offset.toInt, "&amp;", 0, 5)) 5
          else if (text.regionMatches(offset.toInt, "&lt;", 0, 4)) 4
          else if (text.regionMatches(offset.toInt, "&gt;", 0, 4)) 4
          else 1

        // supplementary code points occupy two code units (a surrogate pair)
        case cp => Character.charCount(cp)
      }
    }
}
|
||||
|
||||
/**
 * Ranges of offsets, useful for avoiding slicing entities.
 */
sealed trait Ranges[T] {
  def contains(t: T): Boolean
}

object Ranges {
  // Each pair is a (lo, hi) range that is exclusive at both ends.
  private[this] case class Impl[T](toSeq: Seq[(T, T)])(implicit off: Offset[T])
      extends Ranges[T] {
    def contains(t: T): Boolean = toSeq.exists { case (lo, hi) => off.gt(t, lo) && off.lt(t, hi) }
  }

  /**
   * Non-inclusive range of offsets (matches values that are strictly
   * between `lo` and `hi`). If no value lies strictly between the two
   * bounds, the result is an empty range.
   */
  def between[T](lo: T, hi: T)(implicit off: Offset[T]): Ranges[T] =
    // the second condition guards against `lo + 1` overflowing
    if (off.toInt(hi) > off.toInt(lo) + 1 && off.toInt(lo) < Int.MaxValue) Impl(Seq((lo, hi)))
    else Impl(Nil)

  /**
   * The union of all of the specified ranges.
   */
  def all[T](ranges: Seq[Ranges[T]])(implicit off: Offset[T]): Ranges[T] =
    Impl(
      // Preprocess the ranges so that each contains check is as cheap
      // as possible.
      ranges
        .flatMap { case r: Impl[T] => r.toSeq }
        .sortBy(_._1)
        .foldLeft(Nil: List[(T, T)]) {
          // The next range starts inside the previous one: merge them.
          // Keep the larger upper bound so that a range nested inside
          // the previous one does not shrink the merged range.
          case ((a, b) :: out, (c, d)) if off.lt(c, b) =>
            (a, if (off.lt(b, d)) d else b) :: out
          case (out, r) => r :: out
        }
    )

  def Empty[T: Offset]: Ranges[T] = Impl[T](Nil)

  private[this] val HtmlEscapes = """&(?:amp|lt|gt);""".r

  /**
   * Match [[CodeUnit]]s that would split a HTML entity.
   */
  def htmlEntities(s: String): Ranges[CodeUnit] = {
    val it = HtmlEscapes.findAllIn(s)
    // `it.start` / `it.end` describe the match most recently returned by the iterator
    all(it.map(_ => between(CodeUnit(it.start), CodeUnit(it.end))).toSeq)
  }

  def fromCodePointPairs(pairs: Seq[(Int, Int)]): Ranges[CodePoint] =
    all(pairs.map { case (lo, hi) => between(CodePoint(lo), CodePoint(hi)) })
}
|
||||
}
|
|
@ -0,0 +1,55 @@
|
|||
package com.twitter.tweetypie.tweettext
|
||||
|
||||
/**
 * Code used to convert raw user-provided text into an allowable form.
 */
object PartialHtmlEncoding {

  /**
   * Replaces all `<`, `>`, and '&' chars with "&lt;", "&gt;", and "&amp;", respectively.
   *
   * Tweet text is HTML-encoded at tweet creation time, and is stored and processed in encoded form.
   */
  def encode(text: String): String = {
    val buf = new StringBuilder

    text.foreach {
      case '<' => buf.append("&lt;")
      case '>' => buf.append("&gt;")
      case '&' => buf.append("&amp;")
      case c => buf.append(c)
    }

    buf.toString
  }

  private val AmpLtRegex = "&lt;".r
  private val AmpGtRegex = "&gt;".r
  private val AmpAmpRegex = "&amp;".r

  // "&amp;" is decoded last so that a literal "&amp;lt;" decodes to "&lt;" rather than "<".
  private val partialHtmlDecoder: (String => String) =
    ((s: String) => AmpLtRegex.replaceAllIn(s, "<"))
      .andThen(s => AmpGtRegex.replaceAllIn(s, ">"))
      .andThen(s => AmpAmpRegex.replaceAllIn(s, "&"))

  /**
   * The opposite of encode, it replaces all "&lt;", "&gt;", and "&amp;" with
   * `<`, `>`, and '&', respectively.
   */
  def decode(text: String): String =
    decodeWithModification(text) match {
      case Some(mod) => mod.updated
      case None => text
    }

  /**
   * Decodes encoded entities, and returns a `TextModification` if the text was modified.
   */
  def decodeWithModification(text: String): Option[TextModification] =
    TextModification.replaceAll(
      text,
      AmpLtRegex -> "<",
      AmpGtRegex -> ">",
      AmpAmpRegex -> "&"
    )
}
|
|
@ -0,0 +1,251 @@
|
|||
package com.twitter.tweetypie.tweettext
|
||||
import scala.util.matching.Regex
|
||||
|
||||
/**
 * Code used to convert raw user-provided text into an allowable form.
 */
object Preprocessor {
  import TweetText._
  import TextModification.replaceAll

  /**
   * Regex for dos-style line endings.
   */
  val DosLineEndingRegex: Regex = """\r\n""".r

  /**
   * Converts \r\n to just \n.
   */
  def normalizeNewlines(text: String): String =
    DosLineEndingRegex.replaceAllIn(text, "\n")

  /**
   * Characters to strip out of tweet text at write-time.
   */
  val unicodeCharsToStrip: Seq[Char] =
    Seq(
      '\uFFFE', '\uFEFF', // BOM
      '\uFFFF', // Special
      '\u200E', '\u200F', // ltr, rtl
      '\u202A', '\u202B', '\u202C', '\u202D', '\u202E', // Directional change
      '\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008',
      '\u0009', '\u000B', '\u000C', '\u000E', '\u000F', '\u0010', '\u0011', '\u0012', '\u0013',
      '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001A', '\u001B', '\u001C',
      '\u001D', '\u001E', '\u001F', '\u007F',
      '\u2065',
    )

  // A single character class containing every character listed above.
  val UnicodeCharsToStripRegex: Regex = unicodeCharsToStrip.mkString("[", "", "]").r

  /**
   * Strips out control characters and other non-textual unicode chars that can break xml and/or
   * json rendering, or be used for exploits.
   */
  def stripControlCharacters(text: String): String =
    UnicodeCharsToStripRegex.replaceAllIn(text, "")

  val Tweetypie674UnicodeSequence: String =
    "\u0633\u0645\u064e\u0640\u064e\u0651\u0648\u064f\u0648\u064f\u062d\u062e " +
      "\u0337\u0334\u0310\u062e \u0337\u0334\u0310\u062e \u0337\u0334\u0310\u062e " +
      "\u0627\u0645\u0627\u0631\u062a\u064a\u062e \u0337\u0334\u0310\u062e"

  val Tweetypie674UnicodeRegex: Regex = Tweetypie674UnicodeSequence.r

  /**
   * Replace each `Tweetypie674UnicodeSequence` of this string with a REPLACEMENT
   * CHARACTER.
   *
   * Apple has a bug in its CoreText library. This aims to prevent
   * ios clients from being crashed when a tweet contains the specific
   * unicode sequence.
   */
  def avoidCoreTextBug(text: String): String =
    Tweetypie674UnicodeRegex.replaceAllIn(text, "\ufffd")

  /**
   * Replace each `Tweetypie674UnicodeSequence` of this string with a REPLACEMENT
   * CHARACTER, returns a TextModification object that provides information
   * to also update entity indices.
   */
  def replaceCoreTextBugModification(text: String): Option[TextModification] =
    replaceAll(text, Tweetypie674UnicodeRegex, "\ufffd")

  // The write-path pipeline; the steps run in the order listed.
  private val preprocessor: String => String =
    ((s: String) => nfcNormalize(s))
      .andThen(stripControlCharacters _)
      .andThen(trimBlankCharacters _)
      .andThen(normalizeNewlines _)
      .andThen(collapseBlankLines _)
      .andThen(avoidCoreTextBug _)

  /**
   * Performs the text modifications that are necessary in the write-path before extracting URLs.
   */
  def preprocessText(text: String): String =
    preprocessor(text)

  /**
   * Replaces all `<`, `>`, and '&' chars with "&lt;", "&gt;", and "&amp;", respectively.
   *
   * The original purpose of this was presumably to prevent script injections when
   * displaying tweets without proper escaping. Currently, tweets are encoded before
   * they are stored in the database.
   *
   * Note that the pre-escaping of & < and > also happens in the rich text editor in javascript
   */
  def partialHtmlEncode(text: String): String =
    PartialHtmlEncoding.encode(text)

  /**
   * The opposite of partialHtmlEncode, it replaces all "&lt;", "&gt;", and "&amp;" with
   * `<`, `>`, and '&', respectively.
   */
  def partialHtmlDecode(text: String): String =
    PartialHtmlEncoding.decode(text)

  /**
   * Matches a (possibly empty) run of characters that always or often are rendered as
   * blank space. We use this to prevent users from inserting excess blank lines and from
   * tweeting effectively blank tweets.
   *
   * Note that these are not all semantically "whitespace", so this regex should not be used
   * to process non-blank text, e.g. to separate words.
   *
   * Codepoints below and the `\p{Z}` regex character property alias are defined in the Unicode
   * Character Database (UCD) at https://unicode.org/ucd/ and https://unicode.org/reports/tr44/
   *
   * The `\p{Z}` regex character property alias is defined specifically in UCD as:
   *
   * Zs | Space_Separator     | a space character (of various non-zero widths)
   * Zl | Line_Separator      | U+2028 LINE SEPARATOR only
   * Zp | Paragraph_Separator | U+2029 PARAGRAPH SEPARATOR only
   * Z  | Separator           | Zs | Zl | Zp
   * ref: https://unicode.org/reports/tr44/#GC_Values_Table
   *
   *  U+0009  Horizontal Tab (included in \s)
   *  U+000B  Vertical Tab (included in \s)
   *  U+000C  Form feed (included in \s)
   *  U+000D  Carriage return (included in \s)
   *  U+0020  space (included in \s)
   *  U+0085  Next line (included in \u0085)
   *  U+061C  arabic letter mark (included in \u061C)
   *  U+00A0  no-break space (included in \p{Z})
   *  U+00AD  soft-hyphen marker (included in \u00AD)
   *  U+1680  ogham space mark (included in \p{Z})
   *  U+180E  mongolian vowel separator (included in \p{Z} on jdk8 and included in \u180E on jdk11)
   *  U+2000  en quad (included in \p{Z})
   *  U+2001  em quad (included in \p{Z})
   *  U+2002  en space (included in \p{Z})
   *  U+2003  em space (included in \p{Z})
   *  U+2004  three-per-em space (included in \p{Z})
   *  U+2005  four-per-em space (included in \p{Z})
   *  U+2006  six-per-em space (included in \p{Z})
   *  U+2007  figure space (included in \p{Z})
   *  U+2008  punctuation space (included in \p{Z})
   *  U+2009  thin space (included in \p{Z})
   *  U+200A  hair space (included in \p{Z})
   *  U+200B  zero-width (included in \u200B-\u200D)
   *  U+200C  zero-width non-joiner (included in \u200B-\u200D)
   *  U+200D  zero-width joiner (included in \u200B-\u200D)
   *  U+2028  line separator (included in \p{Z})
   *  U+2029  paragraph separator (included in \p{Z})
   *  U+202F  narrow no-break space (included in \p{Z})
   *  U+205F  medium mathematical space (included in \p{Z})
   *  U+2061  function application (included in \u2061-\u2064)
   *  U+2062  invisible times (included in \u2061-\u2064)
   *  U+2063  invisible separator (included in \u2061-\u2064)
   *  U+2064  invisible plus (included in \u2061-\u2064)
   *  U+2066  left-to-right isolate (included in \u2066-\u2069)
   *  U+2067  right-to-left isolate (included in \u2066-\u2069)
   *  U+2068  first strong isolate (included in \u2066-\u2069)
   *  U+2069  pop directional isolate (included in \u2066-\u2069)
   *  U+206A  inhibit symmetric swapping (included in \u206A-\u206F)
   *  U+206B  activate symmetric swapping (included in \u206A-\u206F)
   *  U+206C  inhibit arabic form shaping (included in \u206A-\u206F)
   *  U+206D  activate arabic form shaping (included in \u206A-\u206F)
   *  U+206E  national digit shapes (included in \u206A-\u206F)
   *  U+206F  nominal digit shapes (included in \u206A-\u206F)
   *  U+2800  braille pattern blank (included in \u2800)
   *  U+3164  hangul filler (see UCD Ignorable_Code_Point)
   *  U+FFA0  halfwidth hangul filler (see UCD Ignorable_Code_Point)
   *  U+3000  ideographic space (included in \p{Z})
   *  U+FEFF  zero-width no-break space (explicitly included in \uFEFF)
   */
  val BlankTextRegex: Regex =
    """[\s\p{Z}\u180E\u0085\u00AD\u061C\u200B-\u200D\u2061-\u2064\u2066-\u2069\u206A-\u206F\u2800\u3164\uFEFF\uFFA0]*""".r

  /**
   * Some of the above blank characters are valid at the start of a Tweet (and irrelevant at the end)
   * such as characters that change the direction of text. When trimming from the start
   * or end of text we use a smaller set of characters
   */
  val BlankWhenLeadingOrTrailingRegex: Regex = """[\s\p{Z}\u180E\u0085\u200B\uFEFF]*""".r

  /**
   * Matches consecutive blanks, starting at a newline.
   */
  val ConsecutiveBlankLinesRegex: Regex = ("""\n(""" + BlankTextRegex + """\n){2,}""").r

  val LeadingBlankCharactersRegex: Regex = ("^" + BlankWhenLeadingOrTrailingRegex).r
  val TrailingBlankCharactersRegex: Regex = (BlankWhenLeadingOrTrailingRegex + "$").r

  /**
   * Is the given text empty or contains nothing but whitespace?
   */
  def isBlank(text: String): Boolean =
    BlankTextRegex.pattern.matcher(text).matches()

  /**
   * See http://confluence.local.twitter.com/display/PROD/Displaying+line+breaks+in+Tweets
   *
   * Collapses consecutive blank lines down to a single blank line. We can assume that
   * all newlines have already been normalized to just \n, so we don't have to worry about
   * \r\n.
   */
  def collapseBlankLinesModification(text: String): Option[TextModification] =
    replaceAll(text, ConsecutiveBlankLinesRegex, "\n\n")

  def collapseBlankLines(text: String): String =
    ConsecutiveBlankLinesRegex.replaceAllIn(text, "\n\n")

  // Removes leading blanks first, then trailing blanks from the result.
  def trimBlankCharacters(text: String): String =
    TrailingBlankCharactersRegex.replaceFirstIn(
      LeadingBlankCharactersRegex.replaceFirstIn(text, ""),
      ""
    )

  /** Characters that are not visible on their own. Some of these are used in combination with
   *  other visible characters, and therefore cannot be always stripped from tweets.
   */
  private[tweettext] val InvisibleCharacters: Seq[Char] =
    Seq(
      '\u2060', '\u2061', '\u2062', '\u2063', '\u2064', '\u206A', '\u206B', '\u206C', '\u206D',
      '\u206E', '\u206F', '\u200C',
      '\u200D', // non-printing chars with valid use in Arabic
      '\u2009', '\u200A', '\u200B', // include very skinny spaces too
      '\ufe00', '\ufe01', '\ufe02', '\ufe03', '\ufe04', '\ufe05', '\ufe06', '\ufe07', '\ufe08',
      '\ufe09', '\ufe0A', '\ufe0B', '\ufe0C', '\ufe0D', '\ufe0E', '\ufe0F',
    )

  // Matches text that consists of one or more invisible characters and nothing else.
  private[tweetypie] val InvisibleUnicodePattern: Regex =
    ("^[" + InvisibleCharacters.mkString + "]+$").r

  def isInvisibleChar(input: Char): Boolean =
    InvisibleCharacters.contains(input)

  /** If string is only "invisible characters", replace full string with whitespace.
   *  The purpose of this method is to remove invisible characters when ONLY invisible characters
   *  appear between two urls, which can be a security vulnerability due to misleading behavior. These
   *  characters cannot be removed as a rule applied to the tweet, because they are used in
   *  conjunction with other characters.
   */
  def replaceInvisiblesWithWhitespace(text: String): String =
    text match {
      // one space per code point, so the code point length of the text is preserved
      case invisible @ InvisibleUnicodePattern() => " " * TweetText.codePointLength(invisible)
      case other => other
    }
}
|
|
@ -0,0 +1,24 @@
|
|||
package com.twitter.tweetypie.tweettext
|
||||
|
||||
/**
 * Type class describing an entity that occupies an index range
 * within a piece of tweet text.
 */
trait TextEntity[T] {
  def fromIndex(entity: T): Short
  def toIndex(entity: T): Short
  def move(entity: T, fromIndex: Short, toIndex: Short): T
}

/**
 * Convenience functions that dispatch to the implicit [[TextEntity]]
 * instance for the entity type.
 */
object TextEntity {
  def fromIndex[T: TextEntity](entity: T): Short = {
    val ev = implicitly[TextEntity[T]]
    ev.fromIndex(entity)
  }

  def toIndex[T: TextEntity](entity: T): Short = {
    val ev = implicitly[TextEntity[T]]
    ev.toIndex(entity)
  }

  def move[T: TextEntity](entity: T, fromIndex: Short, toIndex: Short): T = {
    val ev = implicitly[TextEntity[T]]
    ev.move(entity, fromIndex, toIndex)
  }

  // Moves both ends of the entity by the same (possibly negative) offset.
  def shift[T: TextEntity](entity: T, offset: Short): T = {
    val ev = implicitly[TextEntity[T]]
    val shiftedFrom = (ev.fromIndex(entity) + offset).toShort
    val shiftedTo = (ev.toIndex(entity) + offset).toShort
    ev.move(entity, shiftedFrom, shiftedTo)
  }
}
|
|
@ -0,0 +1,232 @@
|
|||
package com.twitter.tweetypie.tweettext
|
||||
|
||||
import scala.util.matching.Regex
|
||||
|
||||
object TextModification {

  /**
   * Lift a text into a TextModification where `original` and `updated` text are the same
   * and `replacements` is empty.
   */
  def identity(text: String): TextModification =
    TextModification(original = text, updated = text, replacements = Nil)

  /**
   * Replace each substring that matches the regex with the substitution string, returns a
   * TextModification object that contains the updated text and enough information to also
   * update entity indices.
   *
   * This method takes surrogate pairs into account: the returned
   * TextModification object has code-point offsets, instead of code-unit offsets.
   */
  def replaceAll(text: String, regex: Regex, substitution: String): Option[TextModification] =
    replaceAll(text, regex -> substitution)

  /**
   * Replaces substrings that match the given `Regex` with the corresponding substitution
   * string. Returns a `TextModification` that can be used to reindex entities, or `None`
   * if none of the regexes matched.
   *
   * Matches are applied in order of their start offset. A match that begins inside a
   * region already consumed by an earlier match (possible when different regexes
   * overlap) is skipped.
   */
  def replaceAll(
    text: String,
    regexAndSubstitutions: (Regex, String)*
  ): Option[TextModification] = {
    val matches =
      (for {
        (r, s) <- regexAndSubstitutions
        m <- r.findAllIn(text).matchData
      } yield (m, s)).sortBy { case (m, _) => m.start }

    if (matches.isEmpty) {
      // no match found, return None to indicate no modifications made
      None
    } else {
      val replacements = List.newBuilder[TextReplacement]
      val indexConverter = new IndexConverter(text)
      // contains the retained text, built up as we walk through the regex matches
      val buf = new StringBuilder(text.length)
      // the number of code-points copied into buf
      var codePointsCopied = Offset.CodePoint(0)
      // always holds the start code-unit offset to copy to buf when we encounter
      // either a regex match or end-of-string.
      var anchor = 0

      import indexConverter.toCodePoints

      for ((m, sub) <- matches) {
        // A match starting before `anchor` overlaps text that an earlier match
        // already replaced; skip it instead of slicing with an inverted range.
        if (m.start >= anchor) {
          val unchangedText = text.substring(anchor, m.start)
          val unchangedLen = Offset.CodePoint.length(unchangedText)
          val subLen = Offset.CodePoint.length(sub)

          // copies the text up to the regex match run, plus the replacement string
          buf.append(unchangedText).append(sub)
          codePointsCopied += unchangedLen + subLen

          // the offsets indicate the indices of the matched string in the original
          // text, and the indices of the replacement string in the updated string
          replacements +=
            TextReplacement(
              originalFrom = toCodePoints(Offset.CodeUnit(m.start)),
              originalTo = toCodePoints(Offset.CodeUnit(m.end)),
              updatedFrom = codePointsCopied - subLen,
              updatedTo = codePointsCopied
            )

          anchor = m.end
        }
      }

      // copy whatever follows the last applied match
      buf.append(text.substring(anchor))

      Some(TextModification(text, buf.toString, replacements.result()))
    }
  }

  /**
   * Inserts a string at a specified code point offset.
   * Returns a `TextModification` that can be used to reindex entities.
   */
  def insertAt(
    originalText: String,
    insertAt: Offset.CodePoint,
    textToInsert: String
  ): TextModification = {
    val insertAtCodeUnit = insertAt.toCodeUnit(originalText).toInt
    val (before, after) = originalText.splitAt(insertAtCodeUnit)
    val updatedText = s"$before$textToInsert$after"
    val textToInsertLength = TweetText.codePointLength(textToInsert)

    TextModification(
      original = originalText,
      updated = updatedText,
      replacements = List(
        // an empty original segment replaced by the inserted text
        TextReplacement.fromCodePoints(
          originalFrom = insertAt.toInt,
          originalTo = insertAt.toInt,
          updatedFrom = insertAt.toInt,
          updatedTo = insertAt.toInt + textToInsertLength
        ))
    )
  }
}
|
||||
|
||||
/**
 * Describes the insertions/deletions/replacements applied to a string. It carries
 * the original string, the updated string, and a list of TextReplacement objects
 * holding the indices of the segments that were changed. With this information an
 * offset into the original string can be mapped to an offset into the updated
 * string, provided the offset does not fall inside one of the modified segments.
 *
 * All offsets are code-points, not UTF-16 code-units.
 */
case class TextModification(
  original: String,
  updated: String,
  replacements: List[TextReplacement]) {
  private val originalLen = Offset.CodePoint.length(original)

  /**
   * Maps an offset into the original String to the equivalent offset into the updated
   * string. Returns None when the offset lies within a segment that was removed/replaced.
   */
  def reindex(index: Offset.CodePoint): Option[Offset.CodePoint] =
    reindex(index, Offset.CodePoint(0), replacements)

  /**
   * Reindexes an entity of type T. Returns the updated entity, or None if either the
   * `fromIndex` or `toIndex` value is now out of range.
   */
  def reindexEntity[T: TextEntity](e: T): Option[T] =
    reindex(Offset.CodePoint(TextEntity.fromIndex(e))).flatMap { from =>
      // `toIndex` is exclusive, so map the last included position and add one back
      reindex(Offset.CodePoint(TextEntity.toIndex(e) - 1)).map { to =>
        TextEntity.move(e, from.toShort, (to.toShort + 1).toShort)
      }
    }

  /**
   * Reindexes a sequence of entities of type T. Entities that span a region of text
   * that has been removed are dropped from the result.
   */
  def reindexEntities[T: TextEntity](es: Seq[T]): Seq[T] =
    es.flatMap(e => reindexEntity(e))

  /**
   * Swaps `original` and `updated` text and inverts all `TextReplacement` instances.
   */
  def inverse: TextModification =
    TextModification(updated, original, replacements.map(_.inverse))

  // Walks the list of TextReplacement objects, accumulating in `shift` the length
  // change of every replacement that ends at or before `index`; the accumulated
  // shift is added to `index` at the end.
  private def reindex(
    index: Offset.CodePoint,
    shift: Offset.CodePoint,
    reps: List[TextReplacement]
  ): Option[Offset.CodePoint] =
    reps match {
      case (rep @ TextReplacement(segmentFrom, segmentTo, _, _)) :: rest =>
        if (index < segmentFrom) Some(index + shift)
        else if (index < segmentTo) None
        else reindex(index, shift + rep.lengthDelta, rest)
      case Nil =>
        if (index.toInt < 0 || index > originalLen) None
        else Some(index + shift)
    }
}
|
||||
|
||||
object TextReplacement {
  // Builds a TextReplacement from plain Int code-point offsets.
  def fromCodePoints(
    originalFrom: Int,
    originalTo: Int,
    updatedFrom: Int,
    updatedTo: Int
  ): TextReplacement =
    TextReplacement(
      originalFrom = Offset.CodePoint(originalFrom),
      originalTo = Offset.CodePoint(originalTo),
      updatedFrom = Offset.CodePoint(updatedFrom),
      updatedTo = Offset.CodePoint(updatedTo)
    )
}

/**
 * Holds the indices of a segment of text in one string together with the indices of
 * the segment that replaces it in an updated version of the text. The replacement
 * segment may be empty (updatedTo == updatedFrom), meaning the segment was removed.
 *
 * All offsets are code-points, not UTF16 code-units.
 *
 * `originalFrom` and `updatedFrom` are inclusive.
 * `originalTo` and `updatedTo` are exclusive.
 */
case class TextReplacement(
  originalFrom: Offset.CodePoint,
  originalTo: Offset.CodePoint,
  updatedFrom: Offset.CodePoint,
  updatedTo: Offset.CodePoint) {
  def originalLength: Offset.CodePoint = originalTo - originalFrom
  def updatedLength: Offset.CodePoint = updatedTo - updatedFrom
  def lengthDelta: Offset.CodePoint = updatedLength - originalLength

  def shiftOriginal(offset: Offset.CodePoint): TextReplacement =
    copy(originalFrom = originalFrom + offset, originalTo = originalTo + offset)

  def shiftUpdated(offset: Offset.CodePoint): TextReplacement =
    copy(updatedFrom = updatedFrom + offset, updatedTo = updatedTo + offset)

  // Shifts both the original and the updated segment by the same offset.
  def shift(offset: Offset.CodePoint): TextReplacement =
    shiftOriginal(offset).shiftUpdated(offset)

  // Swaps the roles of the original and the updated segment.
  def inverse: TextReplacement =
    copy(
      originalFrom = updatedFrom,
      originalTo = updatedTo,
      updatedFrom = originalFrom,
      updatedTo = originalTo
    )
}
|
|
@ -0,0 +1,159 @@
|
|||
package com.twitter.tweetypie.tweettext
|
||||
|
||||
import com.twitter.tweetypie.tweettext.TweetText._
|
||||
import com.twitter.twittertext.Extractor
|
||||
import java.lang.Character
|
||||
import scala.annotation.tailrec
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object Truncator {
|
||||
val Ellipsis = "\u2026"
|
||||
|
||||
/**
|
||||
* Truncate tweet text for a retweet. If the text is longer than
|
||||
* either of the length limits, code points are cut off from the end
|
||||
* of the text and replaced with an ellipsis. We keep as much of the
|
||||
* leading text as possible, subject to these constraints:
|
||||
*
|
||||
* - There are no more than `OriginalMaxDisplayLength` characters.
|
||||
*
|
||||
* - When converted to UTF-8, the result does not exceed `OriginalMaxUtf8Length`.
|
||||
*
|
||||
* - We do not break within a single grapheme cluster.
|
||||
*
|
||||
* The input is assumed to be partial HTML-encoded and may or may
|
||||
* not be NFC normalized. The result will be partial HTML-encoded
|
||||
* and will be NFC normalized.
|
||||
*/
|
||||
def truncateForRetweet(input: String): String = truncateWithEllipsis(input, Ellipsis)
|
||||
|
||||
/**
|
||||
* Truncate to [[com.twitter.tweetypie.tweettext.TweetText#OriginalMaxDisplayLength]] display
|
||||
* units, using "..." as an ellipsis. The resulting text is guaranteed to pass our tweet length
|
||||
* check, but it is not guaranteed to fit in a SMS message.
|
||||
*/
|
||||
def truncateForSms(input: String): String = truncateWithEllipsis(input, "...")
|
||||
|
||||
/**
 * Checks the length of the given text and truncates it when it is
 * longer than the allowed length for a Tweet. The result always has:
 *
 *  - Display length <= OriginalMaxDisplayLength.
 *  - Length when encoded as UTF-8 <= OriginalMaxUtf8Length.
 *
 * Input that would violate either limit is truncated such that:
 *
 *  - Grapheme clusters will not be split.
 *  - The last character before the ellipsis will not be a whitespace
 *    character.
 *  - The ellipsis text will be appended to the end.
 */
private[this] def truncateWithEllipsis(input: String, ellipsis: String): String = {
  val normalized = nfcNormalize(input)
  val cutOff =
    truncationPoint(normalized, OriginalMaxDisplayLength, OriginalMaxUtf8Length, Some(ellipsis))
      .codeUnitOffset
      .toInt

  // a truncation point at the very end of the text means nothing needs to be cut
  if (cutOff != normalized.length) normalized.take(cutOff) + ellipsis
  else normalized
}
|
||||
|
||||
/**
|
||||
* Indicates a potential TruncationPoint in piece of text.
|
||||
*
|
||||
* @param codeUnitOffset the utf-16 code unit offset of the truncation point
|
||||
* @param codePointOffset the offset in code points
|
||||
*/
|
||||
case class TruncationPoint(codeUnitOffset: Offset.CodeUnit, codePointOffset: Offset.CodePoint)
|
||||
|
||||
/**
 * Computes a TruncationPoint for the given text and length constraints. The result
 * holds the code-unit and code-point offsets at which to cut the text. If the text
 * fits within the given constraints, the result is the last break point of the text.
 *
 * Text should be NFC normalized first for best results.
 *
 * @param withEllipsis if defined, the truncation point will be computed so that there is
 *   space to append this ellipsis and to still remain within the limits.
 *
 * @param atomicUnits may contain a list of ranges that should be treated as atomic unit and
 *   not split.  each tuple is half-open range in code points.
 */
def truncationPoint(
  text: String,
  maxDisplayLength: Int = OriginalMaxDisplayLength,
  maxByteLength: Int = OriginalMaxUtf8Length,
  withEllipsis: Option[String] = None,
  atomicUnits: Offset.Ranges[Offset.CodePoint] = Offset.Ranges.Empty
): TruncationPoint = {
  // candidate cut positions: grapheme ends that do not fall inside an HTML entity
  val breakPoints =
    GraphemeIndexIterator
      .ends(text)
      .filterNot(Offset.Ranges.htmlEntities(text).contains)

  // limits that still leave room for the ellipsis, if one was given
  val displayBudget =
    Offset.DisplayUnit(maxDisplayLength) -
      withEllipsis.map(Offset.DisplayUnit.length).getOrElse(Offset.DisplayUnit(0))
  val byteBudget =
    Offset.Utf8(maxByteLength) -
      withEllipsis.map(Offset.Utf8.length).getOrElse(Offset.Utf8(0))

  // `codeUnit`/`codePoint` track the break point currently being examined, with the
  // display and byte lengths of the text before it; `truncateCodeUnit` and
  // `truncateCodePoint` hold the best truncation point accepted so far.
  @tailrec def go(
    codeUnit: Offset.CodeUnit,
    codePoint: Offset.CodePoint,
    displayLength: Offset.DisplayUnit,
    byteLength: Offset.Utf8,
    truncateCodeUnit: Offset.CodeUnit,
    truncateCodePoint: Offset.CodePoint
  ): TruncationPoint =
    if (displayLength.toInt > maxDisplayLength || byteLength.toInt > maxByteLength) {
      // the text up to here exceeds a limit: cut at the last accepted point
      TruncationPoint(truncateCodeUnit, truncateCodePoint)
    } else if (codeUnit != truncateCodeUnit &&
      displayLength <= displayBudget &&
      byteLength <= byteBudget &&
      (codeUnit.toInt == 0 || !Character.isWhitespace(text.codePointBefore(codeUnit.toInt))) &&
      !atomicUnits.contains(codePoint)) {
      // we can advance the truncation point
      go(codeUnit, codePoint, displayLength, byteLength, codeUnit, codePoint)
    } else if (breakPoints.hasNext) {
      // there are further truncation points to consider
      val nextCodeUnit = breakPoints.next
      go(
        nextCodeUnit,
        codePoint + Offset.CodePoint.count(text, codeUnit, nextCodeUnit),
        displayLength + Offset.DisplayUnit.count(text, codeUnit, nextCodeUnit),
        byteLength + Offset.Utf8.count(text, codeUnit, nextCodeUnit),
        truncateCodeUnit,
        truncateCodePoint
      )
    } else {
      // ran out of break points without exceeding a limit
      TruncationPoint(codeUnit, codePoint)
    }

  go(
    Offset.CodeUnit(0),
    Offset.CodePoint(0),
    Offset.DisplayUnit(0),
    Offset.Utf8(0),
    Offset.CodeUnit(0),
    Offset.CodePoint(0)
  )
}
|
||||
|
||||
/**
|
||||
* Truncate the given text, avoiding chopping HTML entities and tweet
|
||||
* entities. This should only be used for testing because it performs
|
||||
* entity extraction, and so is very inefficient.
|
||||
*/
|
||||
def truncateForTests(
|
||||
input: String,
|
||||
maxDisplayLength: Int = OriginalMaxDisplayLength,
|
||||
maxByteLength: Int = OriginalMaxUtf8Length
|
||||
): String = {
|
||||
val text = nfcNormalize(input)
|
||||
val extractor = new Extractor
|
||||
val entities = extractor.extractEntitiesWithIndices(text)
|
||||
extractor.modifyIndicesFromUTF16ToUnicode(text, entities)
|
||||
val avoid = Offset.Ranges.fromCodePointPairs(
|
||||
entities.asScala.map(e => (e.getStart().intValue, e.getEnd().intValue))
|
||||
)
|
||||
val truncateAt = truncationPoint(text, maxDisplayLength, maxByteLength, None, avoid)
|
||||
text.take(truncateAt.codeUnitOffset.toInt)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,62 @@
|
|||
package com.twitter.tweetypie.tweettext
|
||||
|
||||
import java.text.Normalizer
|
||||
|
||||
object TweetText {
|
||||
|
||||
/** The original maximum tweet length, taking into account normalization */
|
||||
private[tweetypie] val OriginalMaxDisplayLength = 140
|
||||
|
||||
/** Maximum number of visible code points allowed in a tweet when tweet length is counted by code
|
||||
* points, taking into account normalization. See also [[MaxVisibleWeightedEmojiLength]].
|
||||
*/
|
||||
private[tweetypie] val MaxVisibleWeightedLength = 280
|
||||
|
||||
/** Maximum number of visible code points allowed in a tweet when tweet length is counted by
|
||||
* emoji, taking into account normalization. See also [[MaxVisibleWeightedLength]].
|
||||
* 140 is the max number of Emojis, visible, fully-weighted per Twitter's cramming rules
|
||||
* 10 is the max number of Code Points per Emoji
|
||||
*/
|
||||
private[tweetypie] val MaxVisibleWeightedEmojiLength = 140 * 10
|
||||
|
||||
/** Maximum number of bytes when truncating tweet text for a retweet. Originally was the
|
||||
* max UTF-8 length when tweets were at most 140 characters.
|
||||
* See also [[OriginalMaxDisplayLength]].
|
||||
*/
|
||||
private[tweetypie] val OriginalMaxUtf8Length = 600
|
||||
|
||||
/** Maximum number of bytes for tweet text using utf-8 encoding.
|
||||
*/
|
||||
private[tweetypie] val MaxUtf8Length = 5708
|
||||
|
||||
/** Maximum number of mentions allowed in tweet text. This is enforced at tweet creation time */
|
||||
private[tweetypie] val MaxMentions = 50
|
||||
|
||||
/** Maximum number of urls allowed in tweet text. This is enforced at tweet creation time */
|
||||
private[tweetypie] val MaxUrls = 10
|
||||
|
||||
/** Maximum number of hashtags allowed in tweet text. This is enforced at tweet creation time */
|
||||
private[tweetypie] val MaxHashtags = 50
|
||||
|
||||
/** Maximum number of cashtags allowed in tweet text. This is enforced at tweet creation time */
|
||||
private[tweetypie] val MaxCashtags = 50
|
||||
|
||||
/** Maximum length of a hashtag (not including the '#') */
|
||||
private[tweetypie] val MaxHashtagLength = 100
|
||||
|
||||
/**
|
||||
* Normalizes the text according to the unicode NFC spec.
|
||||
*/
|
||||
def nfcNormalize(text: String): String = Normalizer.normalize(text, Normalizer.Form.NFC)
|
||||
|
||||
/**
|
||||
* Return the number of "characters" in this text. See
|
||||
* [[Offset.DisplayUnit]].
|
||||
*/
|
||||
def displayLength(text: String): Int = Offset.DisplayUnit.length(text).toInt
|
||||
|
||||
/**
|
||||
* Return the number of Unicode code points in this String.
|
||||
*/
|
||||
def codePointLength(text: String): Int = Offset.CodePoint.length(text).toInt
|
||||
}
|
76
tweetypie/common/src/scala/com/twitter/tweetypie/util/BUILD
Normal file
76
tweetypie/common/src/scala/com/twitter/tweetypie/util/BUILD
Normal file
|
@ -0,0 +1,76 @@
|
|||
scala_library(
|
||||
sources = ["*.scala"],
|
||||
compiler_option_sets = ["fatal_warnings"],
|
||||
platform = "java8",
|
||||
provides = scala_artifact(
|
||||
org = "com.twitter.tweetypie",
|
||||
name = "util",
|
||||
repo = artifactory,
|
||||
),
|
||||
strict_deps = True,
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"//:scala-reflect",
|
||||
"3rdparty/jvm/commons-codec",
|
||||
"3rdparty/jvm/org/apache/thrift:libthrift",
|
||||
"finagle/finagle-core/src/main",
|
||||
"mediaservices/commons/src/main/thrift:thrift-scala",
|
||||
"scrooge/scrooge-serializer/src/main/scala",
|
||||
"tweetypie/servo/repo",
|
||||
"tweetypie/servo/util",
|
||||
"tweetypie/servo/util/src/main/scala:exception",
|
||||
"src/scala/com/twitter/takedown/util",
|
||||
"src/thrift/com/twitter/dataproducts:enrichments_profilegeo-scala",
|
||||
"src/thrift/com/twitter/escherbird:media-annotation-structs-scala",
|
||||
"src/thrift/com/twitter/expandodo:cards-scala",
|
||||
"src/thrift/com/twitter/gizmoduck:thrift-scala",
|
||||
"src/thrift/com/twitter/servo:servo-exception-scala",
|
||||
"src/thrift/com/twitter/spam/rtf:safety-label-scala",
|
||||
"tweetypie/common/src/thrift/com/twitter/tweetypie:deprecated-scala",
|
||||
"tweetypie/common/src/thrift/com/twitter/tweetypie:service-scala",
|
||||
"tweetypie/common/src/thrift/com/twitter/tweetypie:transient_context-scala",
|
||||
"tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
|
||||
"stitch/stitch-core",
|
||||
"tweet-util",
|
||||
"util/util-core:scala",
|
||||
],
|
||||
)
|
||||
|
||||
scala_library(
|
||||
name = "EditControlUtil",
|
||||
sources = [
|
||||
"EditControlUtil.scala",
|
||||
"package.scala",
|
||||
],
|
||||
compiler_option_sets = ["fatal_warnings"],
|
||||
platform = "java8",
|
||||
provides = scala_artifact(
|
||||
org = "com.twitter.tweetypie",
|
||||
name = "util-EditControlUtil",
|
||||
repo = artifactory,
|
||||
),
|
||||
strict_deps = True,
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"//:scala-reflect",
|
||||
"3rdparty/jvm/commons-codec",
|
||||
"3rdparty/jvm/org/apache/thrift:libthrift",
|
||||
"finagle/finagle-core/src/main",
|
||||
"mediaservices/commons/src/main/thrift:thrift-scala",
|
||||
"scrooge/scrooge-serializer/src/main/scala",
|
||||
"tweetypie/servo/util/src/main/scala:exception",
|
||||
"src/thrift/com/twitter/dataproducts:enrichments_profilegeo-scala",
|
||||
"src/thrift/com/twitter/escherbird:media-annotation-structs-scala",
|
||||
"src/thrift/com/twitter/expandodo:cards-scala",
|
||||
"src/thrift/com/twitter/gizmoduck:thrift-scala",
|
||||
"src/thrift/com/twitter/servo:servo-exception-scala",
|
||||
"src/thrift/com/twitter/spam/rtf:safety-label-scala",
|
||||
"tweetypie/common/src/thrift/com/twitter/tweetypie:deprecated-scala",
|
||||
"tweetypie/common/src/thrift/com/twitter/tweetypie:service-scala",
|
||||
"tweetypie/common/src/thrift/com/twitter/tweetypie:transient_context-scala",
|
||||
"tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
|
||||
"stitch/stitch-core",
|
||||
"tweet-util",
|
||||
"util/util-core:scala",
|
||||
],
|
||||
)
|
|
@ -0,0 +1,29 @@
|
|||
package com.twitter.tweetypie.util
|
||||
|
||||
import com.twitter.escherbird.thriftscala.TweetEntityAnnotation
|
||||
import com.twitter.tweetypie.thriftscala.EscherbirdEntityAnnotations
|
||||
import com.twitter.tweetypie.thriftscala.Tweet
|
||||
|
||||
object CommunityAnnotation {
|
||||
|
||||
val groupId: Long = 8
|
||||
val domainId: Long = 31
|
||||
|
||||
def apply(communityId: Long): TweetEntityAnnotation =
|
||||
TweetEntityAnnotation(groupId, domainId, entityId = communityId)
|
||||
|
||||
def unapply(annotation: TweetEntityAnnotation): Option[Long] =
|
||||
annotation match {
|
||||
case TweetEntityAnnotation(`groupId`, `domainId`, entityId) => Some(entityId)
|
||||
case _ => None
|
||||
}
|
||||
|
||||
// Returns None instead of Some(Seq()) when there are non-community annotations present
|
||||
def additionalFieldsToCommunityIDs(additionalFields: Tweet): Option[Seq[Long]] = {
|
||||
additionalFields.escherbirdEntityAnnotations
|
||||
.map {
|
||||
case EscherbirdEntityAnnotations(entityAnnotations) =>
|
||||
entityAnnotations.flatMap(CommunityAnnotation.unapply)
|
||||
}.filter(_.nonEmpty)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
package com.twitter.tweetypie.util
|
||||
|
||||
import com.twitter.tweetypie.thriftscala.Communities
|
||||
|
||||
object CommunityUtil {
|
||||
|
||||
def communityIds(maybeCommunities: Option[Communities]): Seq[Long] = {
|
||||
maybeCommunities match {
|
||||
case None =>
|
||||
Nil
|
||||
case Some(Communities(seq)) =>
|
||||
seq
|
||||
}
|
||||
}
|
||||
|
||||
def hasCommunity(maybeCommunities: Option[Communities]): Boolean = {
|
||||
maybeCommunities.exists(_.communityIds.nonEmpty)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,112 @@
|
|||
package com.twitter.tweetypie
|
||||
package util
|
||||
|
||||
import com.twitter.tweetypie.thriftscala._
|
||||
|
||||
object ConversationControls {
|
||||
object Create {
|
||||
def byInvitation(
|
||||
inviteViaMention: Option[Boolean] = None
|
||||
): TweetCreateConversationControl.ByInvitation = TweetCreateConversationControl.ByInvitation(
|
||||
TweetCreateConversationControlByInvitation(inviteViaMention = inviteViaMention)
|
||||
)
|
||||
|
||||
def community(
|
||||
inviteViaMention: Option[Boolean] = None
|
||||
): TweetCreateConversationControl.Community = TweetCreateConversationControl.Community(
|
||||
TweetCreateConversationControlCommunity(inviteViaMention = inviteViaMention)
|
||||
)
|
||||
|
||||
def followers(
|
||||
inviteViaMention: Option[Boolean] = None
|
||||
): TweetCreateConversationControl.Followers = TweetCreateConversationControl.Followers(
|
||||
TweetCreateConversationControlFollowers(inviteViaMention = inviteViaMention)
|
||||
)
|
||||
}
|
||||
|
||||
object Scenario {
|
||||
case class CommonScenario(
|
||||
createConversationControl: TweetCreateConversationControl,
|
||||
descriptionSuffix: String,
|
||||
expectedConversationControl: (UserId, Seq[UserId]) => ConversationControl,
|
||||
inviteViaMention: Option[Boolean])
|
||||
|
||||
def mkCommunityScenario(inviteViaMention: Option[Boolean]): CommonScenario =
|
||||
CommonScenario(
|
||||
Create.community(inviteViaMention = inviteViaMention),
|
||||
"community",
|
||||
expectedConversationControl = (authorId, userIds) => {
|
||||
community(userIds, authorId, inviteViaMention)
|
||||
},
|
||||
inviteViaMention
|
||||
)
|
||||
|
||||
def mkByInvitationScenario(inviteViaMention: Option[Boolean]): CommonScenario =
|
||||
CommonScenario(
|
||||
Create.byInvitation(inviteViaMention = inviteViaMention),
|
||||
"invited users",
|
||||
expectedConversationControl = (authorId, userIds) => {
|
||||
byInvitation(userIds, authorId, inviteViaMention)
|
||||
},
|
||||
inviteViaMention
|
||||
)
|
||||
|
||||
def mkFollowersScenario(inviteViaMention: Option[Boolean]): CommonScenario =
|
||||
CommonScenario(
|
||||
Create.followers(inviteViaMention = inviteViaMention),
|
||||
"followers",
|
||||
expectedConversationControl = (authorId, userIds) => {
|
||||
followers(userIds, authorId, inviteViaMention)
|
||||
},
|
||||
inviteViaMention
|
||||
)
|
||||
|
||||
val communityScenario = mkCommunityScenario(None)
|
||||
val communityInviteViaMentionScenario = mkCommunityScenario(Some(true))
|
||||
|
||||
val byInvitationScenario = mkByInvitationScenario(None)
|
||||
val byInvitationInviteViaMentionScenario = mkByInvitationScenario(Some(true))
|
||||
|
||||
val followersScenario = mkFollowersScenario(None)
|
||||
val followersInviteViaMentionScenario = mkFollowersScenario(Some(true))
|
||||
}
|
||||
|
||||
def byInvitation(
|
||||
invitedUserIds: Seq[UserId],
|
||||
conversationTweetAuthorId: UserId,
|
||||
inviteViaMention: Option[Boolean] = None
|
||||
): ConversationControl =
|
||||
ConversationControl.ByInvitation(
|
||||
ConversationControlByInvitation(
|
||||
conversationTweetAuthorId = conversationTweetAuthorId,
|
||||
invitedUserIds = invitedUserIds,
|
||||
inviteViaMention = inviteViaMention
|
||||
)
|
||||
)
|
||||
|
||||
def community(
|
||||
invitedUserIds: Seq[UserId],
|
||||
conversationTweetAuthorId: UserId,
|
||||
inviteViaMention: Option[Boolean] = None
|
||||
): ConversationControl =
|
||||
ConversationControl.Community(
|
||||
ConversationControlCommunity(
|
||||
conversationTweetAuthorId = conversationTweetAuthorId,
|
||||
invitedUserIds = invitedUserIds,
|
||||
inviteViaMention = inviteViaMention
|
||||
)
|
||||
)
|
||||
|
||||
def followers(
|
||||
invitedUserIds: Seq[UserId],
|
||||
conversationTweetAuthorId: UserId,
|
||||
inviteViaMention: Option[Boolean] = None
|
||||
): ConversationControl =
|
||||
ConversationControl.Followers(
|
||||
ConversationControlFollowers(
|
||||
conversationTweetAuthorId = conversationTweetAuthorId,
|
||||
invitedUserIds = invitedUserIds,
|
||||
inviteViaMention = inviteViaMention
|
||||
)
|
||||
)
|
||||
}
|
|
@ -0,0 +1,174 @@
|
|||
package com.twitter.tweetypie.util
|
||||
|
||||
import com.twitter.servo.util.Gate
|
||||
import com.twitter.tweetypie.util.TweetEditFailure.TweetEditInvalidEditControlException
|
||||
import com.twitter.tweetypie.util.TweetEditFailure.TweetEditUpdateEditControlException
|
||||
import com.twitter.tweetypie.thriftscala.EditControl
|
||||
import com.twitter.tweetypie.thriftscala.EditControlEdit
|
||||
import com.twitter.tweetypie.thriftscala.EditControlInitial
|
||||
import com.twitter.tweetypie.thriftscala.Tweet
|
||||
import com.twitter.util.Try
|
||||
import com.twitter.util.Return
|
||||
import com.twitter.util.Throw
|
||||
import com.twitter.util.Time
|
||||
import com.twitter.util.Duration
|
||||
|
||||
object EditControlUtil {
|
||||
|
||||
val maxTweetEditsAllowed = 5
|
||||
val oldEditTimeWindow = Duration.fromMinutes(30)
|
||||
val editTimeWindow = Duration.fromMinutes(60)
|
||||
|
||||
def editControlEdit(
|
||||
initialTweetId: TweetId,
|
||||
editControlInitial: Option[EditControlInitial] = None
|
||||
): EditControl.Edit =
|
||||
EditControl.Edit(
|
||||
EditControlEdit(initialTweetId = initialTweetId, editControlInitial = editControlInitial))
|
||||
|
||||
// EditControl for the tweet that is not an edit, that is, any regular tweet we create
|
||||
// that can, potentially, be edited later.
|
||||
def makeEditControlInitial(
|
||||
tweetId: TweetId,
|
||||
createdAt: Time,
|
||||
setEditWindowToSixtyMinutes: Gate[Unit] = Gate(_ => false)
|
||||
): EditControl.Initial = {
|
||||
val editWindow = if (setEditWindowToSixtyMinutes()) editTimeWindow else oldEditTimeWindow
|
||||
val initial = EditControlInitial(
|
||||
editTweetIds = Seq(tweetId),
|
||||
editableUntilMsecs = Some(createdAt.plus(editWindow).inMilliseconds),
|
||||
editsRemaining = Some(maxTweetEditsAllowed),
|
||||
isEditEligible = defaultIsEditEligible,
|
||||
)
|
||||
EditControl.Initial(initial)
|
||||
}
|
||||
|
||||
// Returns if a given latestTweetId is the latest edit in the EditControl
|
||||
def isLatestEdit(
|
||||
tweetEditControl: Option[EditControl],
|
||||
latestTweetId: TweetId
|
||||
): Try[Boolean] = {
|
||||
tweetEditControl match {
|
||||
case Some(EditControl.Initial(initial)) =>
|
||||
isLatestEditFromEditControlInitial(Some(initial), latestTweetId)
|
||||
case Some(EditControl.Edit(edit)) =>
|
||||
isLatestEditFromEditControlInitial(
|
||||
edit.editControlInitial,
|
||||
latestTweetId
|
||||
)
|
||||
case _ => Throw(TweetEditInvalidEditControlException)
|
||||
}
|
||||
}
|
||||
|
||||
// Returns if a given latestTweetId is the latest edit in the EditControlInitial
|
||||
private def isLatestEditFromEditControlInitial(
|
||||
initialTweetEditControl: Option[EditControlInitial],
|
||||
latestTweetId: TweetId
|
||||
): Try[Boolean] = {
|
||||
initialTweetEditControl match {
|
||||
case Some(initial) =>
|
||||
Return(latestTweetId == initial.editTweetIds.last)
|
||||
case _ => Throw(TweetEditInvalidEditControlException)
|
||||
}
|
||||
}
|
||||
|
||||
/* Create an updated edit control for an initialTweet given the id of the new edit */
|
||||
def editControlForInitialTweet(
|
||||
initialTweet: Tweet,
|
||||
newEditId: TweetId
|
||||
): Try[EditControl.Initial] = {
|
||||
initialTweet.editControl match {
|
||||
case Some(EditControl.Initial(initial)) =>
|
||||
Return(EditControl.Initial(plusEdit(initial, newEditId)))
|
||||
|
||||
case Some(EditControl.Edit(_)) => Throw(TweetEditUpdateEditControlException)
|
||||
|
||||
case _ =>
|
||||
initialTweet.coreData match {
|
||||
case Some(coreData) =>
|
||||
Return(
|
||||
makeEditControlInitial(
|
||||
tweetId = initialTweet.id,
|
||||
createdAt = Time.fromMilliseconds(coreData.createdAtSecs * 1000),
|
||||
setEditWindowToSixtyMinutes = Gate(_ => true)
|
||||
)
|
||||
)
|
||||
case None => Throw(new Exception("Tweet Missing Required CoreData"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
def updateEditControl(tweet: Tweet, newEditId: TweetId): Try[Tweet] =
|
||||
editControlForInitialTweet(tweet, newEditId).map { editControl =>
|
||||
tweet.copy(editControl = Some(editControl))
|
||||
}
|
||||
|
||||
def plusEdit(initial: EditControlInitial, newEditId: TweetId): EditControlInitial = {
|
||||
val newEditTweetIds = (initial.editTweetIds :+ newEditId).distinct.sorted
|
||||
val editsCount = newEditTweetIds.size - 1 // as there is the original tweet ID there too.
|
||||
initial.copy(
|
||||
editTweetIds = newEditTweetIds,
|
||||
editsRemaining = Some(maxTweetEditsAllowed - editsCount),
|
||||
)
|
||||
}
|
||||
|
||||
// The ID of the initial Tweet if this is an edit
|
||||
def getInitialTweetIdIfEdit(tweet: Tweet): Option[TweetId] = tweet.editControl match {
|
||||
case Some(EditControl.Edit(edit)) => Some(edit.initialTweetId)
|
||||
case _ => None
|
||||
}
|
||||
|
||||
// If this is the first tweet in an edit chain, return the same tweet id
|
||||
// otherwise return the result of getInitialTweetId
|
||||
def getInitialTweetId(tweet: Tweet): TweetId =
|
||||
getInitialTweetIdIfEdit(tweet).getOrElse(tweet.id)
|
||||
|
||||
def isInitialTweet(tweet: Tweet): Boolean =
|
||||
getInitialTweetId(tweet) == tweet.id
|
||||
|
||||
// Extracted just so that we can easily track where the values of isEditEligible is coming from.
|
||||
private def defaultIsEditEligible: Option[Boolean] = Some(true)
|
||||
|
||||
// returns true if it's an edit of a Tweet or an initial Tweet that's been edited
|
||||
def isEditTweet(tweet: Tweet): Boolean =
|
||||
tweet.editControl match {
|
||||
case Some(eci: EditControl.Initial) if eci.initial.editTweetIds.size <= 1 => false
|
||||
case Some(_: EditControl.Initial) | Some(_: EditControl.Edit) | Some(
|
||||
EditControl.UnknownUnionField(_)) =>
|
||||
true
|
||||
case None => false
|
||||
}
|
||||
|
||||
// returns true if editControl is from an edit of a Tweet
|
||||
// returns false for any other state, including edit intial.
|
||||
def isEditControlEdit(editControl: EditControl): Boolean = {
|
||||
editControl match {
|
||||
case _: EditControl.Edit | EditControl.UnknownUnionField(_) => true
|
||||
case _ => false
|
||||
}
|
||||
}
|
||||
|
||||
def getEditTweetIds(editControl: Option[EditControl]): Try[Seq[TweetId]] = {
|
||||
editControl match {
|
||||
case Some(EditControl.Edit(EditControlEdit(_, Some(eci)))) =>
|
||||
Return(eci.editTweetIds)
|
||||
case Some(EditControl.Initial(initial)) =>
|
||||
Return(initial.editTweetIds)
|
||||
case _ =>
|
||||
Throw(new Exception(s"EditControlInitial not found in $editControl"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
object TweetEditFailure {
|
||||
abstract class TweetEditException(msg: String) extends Exception(msg)
|
||||
|
||||
case object TweetEditGetInitialEditControlException
|
||||
extends TweetEditException("Initial EditControl not found")
|
||||
|
||||
case object TweetEditInvalidEditControlException
|
||||
extends TweetEditException("Invalid EditControl for initial_tweet")
|
||||
|
||||
case object TweetEditUpdateEditControlException
|
||||
extends TweetEditException("Invalid Edit Control Update")
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
package com.twitter.tweetypie.util
|
||||
|
||||
import com.twitter.finagle.Backoff
|
||||
import com.twitter.finagle.service.RetryPolicy
|
||||
import com.twitter.finagle.service.RetryPolicy.RetryableWriteException
|
||||
import com.twitter.servo.exception.thriftscala.ServerError
|
||||
import com.twitter.util.Duration
|
||||
import com.twitter.util.Throw
|
||||
import com.twitter.util.TimeoutException
|
||||
import com.twitter.util.Try
|
||||
|
||||
object RetryPolicyBuilder {
|
||||
|
||||
/**
|
||||
* Retry on any exception.
|
||||
*/
|
||||
def anyFailure[A](backoffs: Stream[Duration]): RetryPolicy[Try[A]] =
|
||||
RetryPolicy.backoff[Try[A]](Backoff.fromStream(backoffs)) {
|
||||
case Throw(_) => true
|
||||
}
|
||||
|
||||
/**
|
||||
* Retry on com.twitter.util.TimeoutException
|
||||
*/
|
||||
def timeouts[A](backoffs: Stream[Duration]): RetryPolicy[Try[A]] =
|
||||
RetryPolicy.backoff[Try[A]](Backoff.fromStream(backoffs)) {
|
||||
case Throw(_: TimeoutException) => true
|
||||
}
|
||||
|
||||
/**
|
||||
* Retry on com.twitter.finagle.service.RetryableWriteExceptions
|
||||
*/
|
||||
def writes[A](backoffs: Stream[Duration]): RetryPolicy[Try[A]] =
|
||||
RetryPolicy.backoff[Try[A]](Backoff.fromStream(backoffs)) {
|
||||
case Throw(RetryableWriteException(_)) => true
|
||||
}
|
||||
|
||||
/**
|
||||
* Retry on com.twitter.servo.exception.thriftscala.ServerError
|
||||
*/
|
||||
def servoServerError[A](backoffs: Stream[Duration]): RetryPolicy[Try[A]] =
|
||||
RetryPolicy.backoff[Try[A]](Backoff.fromStream(backoffs)) {
|
||||
case Throw(ServerError(_)) => true
|
||||
}
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
package com.twitter.tweetypie.util
|
||||
|
||||
import com.twitter.finagle.stats.Stat
|
||||
import com.twitter.finagle.stats.StatsReceiver
|
||||
import com.twitter.servo
|
||||
import com.twitter.util.Return
|
||||
import com.twitter.util.Throw
|
||||
import com.twitter.stitch.Stitch
|
||||
|
||||
object StitchUtils {
|
||||
def trackLatency[T](latencyStat: Stat, s: => Stitch[T]): Stitch[T] = {
|
||||
Stitch
|
||||
.time(s)
|
||||
.map {
|
||||
case (res, duration) =>
|
||||
latencyStat.add(duration.inMillis)
|
||||
res
|
||||
}
|
||||
.lowerFromTry
|
||||
}
|
||||
|
||||
def observe[T](statsReceiver: StatsReceiver, apiName: String): Stitch[T] => Stitch[T] = {
|
||||
val stats = statsReceiver.scope(apiName)
|
||||
|
||||
val requests = stats.counter("requests")
|
||||
val success = stats.counter("success")
|
||||
val latencyStat = stats.stat("latency_ms")
|
||||
|
||||
val exceptionCounter =
|
||||
new servo.util.ExceptionCounter(stats, "failures")
|
||||
|
||||
stitch =>
|
||||
trackLatency(latencyStat, stitch)
|
||||
.respond {
|
||||
case Return(_) =>
|
||||
requests.incr()
|
||||
success.incr()
|
||||
|
||||
case Throw(e) =>
|
||||
exceptionCounter(e)
|
||||
requests.incr()
|
||||
}
|
||||
}
|
||||
|
||||
def translateExceptions[T](
|
||||
stitch: Stitch[T],
|
||||
translateException: PartialFunction[Throwable, Throwable]
|
||||
): Stitch[T] =
|
||||
stitch.rescue {
|
||||
case t if translateException.isDefinedAt(t) =>
|
||||
Stitch.exception(translateException(t))
|
||||
case t => Stitch.exception(t)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
package com.twitter.tweetypie.util
|
||||
|
||||
/**
|
||||
* Escape a String into Java or Scala String literal syntax (adds the
|
||||
* surrounding quotes.)
|
||||
*
|
||||
* This is primarily for printing Strings for debugging or logging.
|
||||
*/
|
||||
object StringLiteral extends (String => String) {
|
||||
private[this] val ControlLimit = ' '
|
||||
private[this] val PrintableLimit = '\u007e'
|
||||
private[this] val Specials =
|
||||
Map('\n' -> 'n', '\r' -> 'r', '\t' -> 't', '"' -> '"', '\'' -> '\'', '\\' -> '\\')
|
||||
|
||||
def apply(str: String): String = {
|
||||
val s = new StringBuilder(str.length)
|
||||
s.append('"')
|
||||
var i = 0
|
||||
while (i < str.length) {
|
||||
val c = str(i)
|
||||
Specials.get(c) match {
|
||||
case None =>
|
||||
if (c >= ControlLimit && c <= PrintableLimit) s.append(c)
|
||||
else s.append("\\u%04x".format(c.toInt))
|
||||
case Some(special) => s.append('\\').append(special)
|
||||
}
|
||||
i += 1
|
||||
}
|
||||
s.append('"').result
|
||||
}
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package com.twitter.tweetypie.util
|
||||
|
||||
import com.twitter.takedown.util.TakedownReasons
|
||||
import com.twitter.takedown.util.TakedownReasons.CountryCode
|
||||
import com.twitter.tseng.withholding.thriftscala.TakedownReason
|
||||
import com.twitter.tseng.withholding.thriftscala.UnspecifiedReason
|
||||
import com.twitter.tweetypie.thriftscala.Tweet
|
||||
|
||||
/**
|
||||
* Contains tweetypie-specific utils for working with TakedownReasons.
|
||||
*/
|
||||
object Takedowns {
|
||||
|
||||
type CountryCode = String
|
||||
|
||||
/**
|
||||
* Take a list of [[TakedownReason]] and return values to be saved on the [[Tweet]] in fields
|
||||
* tweetypieOnlyTakedownCountryCode and tweetypieOnlyTakedownReason.
|
||||
*
|
||||
* - tweetypieOnlyTakedownCountryCode contains the country_code of all UnspecifiedReasons
|
||||
* - tweetypieOnlyTakedownReason contains all other reasons
|
||||
*/
|
||||
def partitionReasons(reasons: Seq[TakedownReason]): (Seq[String], Seq[TakedownReason]) = {
|
||||
val (unspecifiedReasons, specifiedReasons) = reasons.partition {
|
||||
case TakedownReason.UnspecifiedReason(UnspecifiedReason(_)) => true
|
||||
case _ => false
|
||||
}
|
||||
val unspecifiedCountryCodes = unspecifiedReasons.collect(TakedownReasons.reasonToCountryCode)
|
||||
(unspecifiedCountryCodes, specifiedReasons)
|
||||
}
|
||||
|
||||
def fromTweet(t: Tweet): Takedowns =
|
||||
Takedowns(
|
||||
Seq
|
||||
.concat(
|
||||
t.tweetypieOnlyTakedownCountryCodes
|
||||
.getOrElse(Nil).map(TakedownReasons.countryCodeToReason),
|
||||
t.tweetypieOnlyTakedownReasons.getOrElse(Nil)
|
||||
).toSet
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* This class is used to ensure the caller has access to both the full list of reasons as well
|
||||
* as the backwards-compatible list of country codes.
|
||||
*/
|
||||
case class Takedowns(reasons: Set[TakedownReason]) {
|
||||
def countryCodes: Set[CountryCode] = reasons.collect(TakedownReasons.reasonToCountryCode)
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
package com.twitter.tweetypie.util
|
||||
|
||||
import com.twitter.tweetypie.thriftscala.TransientCreateContext
|
||||
import com.twitter.tweetypie.thriftscala.TweetCreateContextKey
|
||||
import com.twitter.tweetypie.thriftscala.TweetCreateContextKey.PeriscopeCreatorId
|
||||
import com.twitter.tweetypie.thriftscala.TweetCreateContextKey.PeriscopeIsLive
|
||||
|
||||
object TransientContextUtil {
|
||||
|
||||
def toAdditionalContext(context: TransientCreateContext): Map[TweetCreateContextKey, String] =
|
||||
Seq
|
||||
.concat(
|
||||
context.periscopeIsLive.map(PeriscopeIsLive -> _.toString), // "true" or "false"
|
||||
context.periscopeCreatorId.map(PeriscopeCreatorId -> _.toString) // userId
|
||||
)
|
||||
.toMap
|
||||
}
|
|
@ -0,0 +1,203 @@
|
|||
package com.twitter.tweetypie.util
|
||||
|
||||
import com.twitter.conversions.DurationOps._
|
||||
import com.twitter.logging.Logger
|
||||
import com.twitter.mediaservices.commons.mediainformation.thriftscala.UserDefinedProductMetadata
|
||||
import com.twitter.scrooge.BinaryThriftStructSerializer
|
||||
import com.twitter.servo.cache.ScopedCacheKey
|
||||
import com.twitter.servo.util.Transformer
|
||||
import com.twitter.tweetypie.thriftscala.PostTweetRequest
|
||||
import com.twitter.util.Base64Long
|
||||
import com.twitter.util.Time
|
||||
import java.nio.ByteBuffer
|
||||
import java.security.MessageDigest
|
||||
import org.apache.commons.codec.binary.Base64
|
||||
import scala.collection.immutable.SortedMap
|
||||
|
||||
object TweetCreationLock {
|
||||
case class Key private (userId: UserId, typeCode: String, idOrMd5: String)
|
||||
extends ScopedCacheKey("t", "locker", 2, Base64Long.toBase64(userId), typeCode, idOrMd5) {
|
||||
def uniquenessId: Option[String] =
|
||||
if (typeCode == Key.TypeCode.UniquenessId) Some(idOrMd5) else None
|
||||
}
|
||||
|
||||
object Key {
|
||||
private[this] val log = Logger(getClass)
|
||||
|
||||
object TypeCode {
|
||||
val SourceTweetId = "r"
|
||||
val UniquenessId = "u"
|
||||
val PostTweetRequest = "p"
|
||||
}
|
||||
|
||||
private[this] val serializer = BinaryThriftStructSerializer(PostTweetRequest)
|
||||
|
||||
// normalize the representation of no media ids.
|
||||
private[util] def sanitizeMediaUploadIds(mediaUploadIds: Option[Seq[Long]]) =
|
||||
mediaUploadIds.filter(_.nonEmpty)
|
||||
|
||||
/**
|
||||
* Request deduplication depends on the hash of a serialized Thrift value.
|
||||
*
|
||||
* In order to guarantee that a Map has a reproducible serialized form,
|
||||
* it's necessary to fix the ordering of its keys.
|
||||
*/
|
||||
private[util] def sanitizeMediaMetadata(
|
||||
mediaMetadata: Option[scala.collection.Map[MediaId, UserDefinedProductMetadata]]
|
||||
): Option[scala.collection.Map[MediaId, UserDefinedProductMetadata]] =
|
||||
mediaMetadata.map(m => SortedMap(m.toSeq: _*))
|
||||
|
||||
/**
|
||||
* Make sure to sanitize request fields with map/set since serialized
|
||||
* bytes ordering is not guaranteed for same thrift values.
|
||||
*/
|
||||
private[util] def sanitizeRequest(request: PostTweetRequest): PostTweetRequest =
|
||||
PostTweetRequest(
|
||||
userId = request.userId,
|
||||
text = request.text,
|
||||
createdVia = "",
|
||||
inReplyToTweetId = request.inReplyToTweetId,
|
||||
geo = request.geo,
|
||||
mediaUploadIds = sanitizeMediaUploadIds(request.mediaUploadIds),
|
||||
narrowcast = request.narrowcast,
|
||||
nullcast = request.nullcast,
|
||||
additionalFields = request.additionalFields,
|
||||
attachmentUrl = request.attachmentUrl,
|
||||
mediaMetadata = sanitizeMediaMetadata(request.mediaMetadata),
|
||||
conversationControl = request.conversationControl,
|
||||
underlyingCreativesContainerId = request.underlyingCreativesContainerId,
|
||||
editOptions = request.editOptions,
|
||||
noteTweetOptions = request.noteTweetOptions
|
||||
)
|
||||
|
||||
def bySourceTweetId(userId: UserId, sourceTweetId: TweetId): Key =
|
||||
Key(userId, TypeCode.SourceTweetId, Base64Long.toBase64(sourceTweetId))
|
||||
|
||||
/**
 * Key identifying a post-tweet request.
 *
 * When the request carries a uniqueness id, that id alone determines the
 * key (see byUniquenessId). Otherwise the key is the Base64 form of the
 * SHA-256 digest of the sanitized, Thrift-serialized request.
 */
def byRequest(request: PostTweetRequest): Key =
  request.uniquenessId
    .map(uqid => byUniquenessId(request.userId, uqid))
    .getOrElse {
      val sanitized = sanitizeRequest(request)
      // A fresh MessageDigest is obtained on every call.
      val sha256 = MessageDigest.getInstance("SHA-256")
      val digestBytes = sha256.digest(serializer.toBytes(sanitized))
      val key =
        Key(request.userId, TypeCode.PostTweetRequest, Base64.encodeBase64String(digestBytes))
      log.ifDebug(s"Generated key $key from request:\n${sanitized}")
      key
    }
|
||||
|
||||
/**
 * Key for tweets that have a uniqueness id set. All clients share a
 * single namespace of uniqueness ids, so the ids are expected to be
 * Snowflake ids in order to avoid cache collisions.
 *
 * The key payload is the Base64Long encoding of `uniquenessId`.
 */
def byUniquenessId(userId: UserId, uniquenessId: Long): Key = {
  val encodedUniquenessId = Base64Long.toBase64(uniquenessId)
  Key(userId, TypeCode.UniquenessId, encodedUniquenessId)
}
|
||||
}
|
||||
|
||||
/**
 * The state of tweet creation for a given Key (request).
 */
sealed trait State

object State {

  /**
   * There is no tweet creation currently in progress. (This can
   * either be represented by no entry in the cache, or this special
   * marker. This lets us use checkAndSet for deletion to avoid
   * accidentally overwriting other process' values.)
   */
  case object Unlocked extends State

  /**
   * Some process is attempting to create the tweet.
   */
  case class InProgress(token: Long, timestamp: Time) extends State

  /**
   * The tweet has already been successfully created, and has the
   * specified id.
   */
  case class AlreadyCreated(tweetId: TweetId, timestamp: Time) extends State

  /**
   * When stored in cache, each state is prefixed by a byte
   * indicating the type of the entry.
   */
  object TypeCode {
    val Unlocked: Byte = 0.toByte
    val InProgress: Byte = 1.toByte // + random long + timestamp
    val AlreadyCreated: Byte = 2.toByte // + tweet id + timestamp
  }

  private[this] val BufferSize = 17 // type byte + 64-bit value + 64-bit timestamp

  // Constant buffer to use for storing the serialized form on
  // Unlocked. It holds only the type byte.
  private[this] val UnlockedBuf = Array[Byte](TypeCode.Unlocked)

  // Store the serialization function in a ThreadLocal so that we can
  // reuse the buffer between invocations.
  //
  // Each thread's function closes over its own BufferSize-byte array and
  // hands that same array back for every InProgress / AlreadyCreated value,
  // so a returned array is overwritten by the next such call on that thread.
  // NOTE(review): this relies on callers being done with the array before
  // the same thread serializes again; confirm against the cache client.
  private[this] val threadLocalSerialize = new ThreadLocal[State => Array[Byte]] {
    override def initialValue(): State => Array[Byte] = {
      // Allocate the thread-local state
      val ary = new Array[Byte](BufferSize)
      val buf = ByteBuffer.wrap(ary)

      {
        case Unlocked => UnlockedBuf
        case InProgress(token, timestamp) =>
          buf.clear()
          buf
            .put(TypeCode.InProgress)
            .putLong(token)
            .putLong(timestamp.sinceEpoch.inNanoseconds)
          ary
        case AlreadyCreated(tweetId, timestamp) =>
          buf.clear()
          buf
            .put(TypeCode.AlreadyCreated)
            .putLong(tweetId)
            .putLong(timestamp.sinceEpoch.inNanoseconds)
          ary
      }
    }
  }

  /**
   * Convert this State to the cache representation: a type byte, followed
   * (for InProgress and AlreadyCreated) by a 64-bit value and a 64-bit
   * timestamp in nanoseconds since the epoch.
   */
  private[this] def toBytes(state: State): Array[Byte] =
    threadLocalSerialize.get()(state)

  /**
   * Convert this byte array into a State.
   *
   * @throws RuntimeException if the buffer is not of the right size
   *   and format: the type code is unknown, or bytes are left over after
   *   the value has been read. Reading past the end of a too-short
   *   buffer fails inside ByteBuffer.
   */
  private[this] def fromBytes(bytes: Array[Byte]): State = {
    val buf = ByteBuffer.wrap(bytes)
    val result = buf.get() match {
      case TypeCode.Unlocked => Unlocked
      case TypeCode.InProgress => InProgress(buf.getLong(), buf.getLong().nanoseconds.afterEpoch)
      case TypeCode.AlreadyCreated =>
        AlreadyCreated(buf.getLong(), buf.getLong().nanoseconds.afterEpoch)
      case other => throw new RuntimeException("Invalid type code: " + other)
    }
    if (buf.remaining != 0) {
      // Report sizes rather than concatenating the array itself, which would
      // only print its identity (e.g. "[B@1b2c3d").
      throw new RuntimeException(
        s"Extra data in buffer: ${buf.remaining} trailing byte(s) in a ${bytes.length}-byte value"
      )
    }
    result
  }

  /**
   * How to serialize the State for storage in cache.
   */
  val Serializer: Transformer[State, Array[Byte]] =
    Transformer[State, Array[Byte]](tTo = toBytes _, tFrom = fromBytes _)
}
|
||||
}
|
|
@ -0,0 +1,506 @@
|
|||
package com.twitter.tweetypie.util
|
||||
|
||||
import com.twitter.dataproducts.enrichments.thriftscala.ProfileGeoEnrichment
|
||||
import com.twitter.expandodo.thriftscala._
|
||||
import com.twitter.mediaservices.commons.thriftscala.MediaKey
|
||||
import com.twitter.mediaservices.commons.tweetmedia.thriftscala._
|
||||
import com.twitter.servo.data.Lens
|
||||
import com.twitter.spam.rtf.thriftscala.SafetyLabel
|
||||
import com.twitter.tseng.withholding.thriftscala.TakedownReason
|
||||
import com.twitter.tweetypie.thriftscala._
|
||||
import com.twitter.tweetypie.unmentions.thriftscala.UnmentionData
|
||||
|
||||
object TweetLenses {
|
||||
import Lens.checkEq
|
||||
|
||||
/**
 * Turn a lens onto an optional field into a lens onto the value inside it.
 *
 * Reading through the resulting lens requires the field to be defined;
 * writing always stores `Some(b)`.
 *
 * @throws NoSuchElementException from the getter when `l` yields None
 */
def requireSome[A, B](l: Lens[A, Option[B]]): Lens[A, B] =
  checkEq[A, B](
    a =>
      // Same exception type as Option.get, but with a message that points
      // at this helper instead of the bare "None.get".
      l.get(a).getOrElse(
        throw new NoSuchElementException(
          "requireSome: the underlying lens returned None for a required value"
        )
      ),
    (a, b) => l.set(a, Some(b))
  )
|
||||
|
||||
/**
 * Shorthand for building a `checkEq` lens whose outer type is fixed to Tweet,
 * so call sites only spell out the field type.
 */
def tweetLens[A](get: Tweet => A, set: (Tweet, A) => Tweet): Lens[Tweet, A] =
  Lens.checkEq[Tweet, A](get, set)
|
||||
|
||||
val id: Lens[Tweet, TweetId] =
|
||||
tweetLens[TweetId](_.id, (t, id) => t.copy(id = id))
|
||||
|
||||
val coreData: Lens[Tweet, Option[TweetCoreData]] =
|
||||
tweetLens[Option[TweetCoreData]](_.coreData, (t, coreData) => t.copy(coreData = coreData))
|
||||
|
||||
val requiredCoreData: Lens[Tweet, TweetCoreData] =
|
||||
requireSome(coreData)
|
||||
|
||||
val optUrls: Lens[Tweet, Option[Seq[UrlEntity]]] =
|
||||
tweetLens[Option[Seq[UrlEntity]]](_.urls, (t, urls) => t.copy(urls = urls))
|
||||
|
||||
val urls: Lens[Tweet, Seq[UrlEntity]] =
|
||||
tweetLens[Seq[UrlEntity]](_.urls.toSeq.flatten, (t, urls) => t.copy(urls = Some(urls)))
|
||||
|
||||
val optMentions: Lens[Tweet, Option[Seq[MentionEntity]]] =
|
||||
tweetLens[Option[Seq[MentionEntity]]](_.mentions, (t, v) => t.copy(mentions = v))
|
||||
|
||||
val mentions: Lens[Tweet, Seq[MentionEntity]] =
|
||||
tweetLens[Seq[MentionEntity]](_.mentions.toSeq.flatten, (t, v) => t.copy(mentions = Some(v)))
|
||||
|
||||
val unmentionData: Lens[Tweet, Option[UnmentionData]] =
|
||||
tweetLens[Option[UnmentionData]](_.unmentionData, (t, v) => t.copy(unmentionData = v))
|
||||
|
||||
val optHashtags: Lens[Tweet, Option[Seq[HashtagEntity]]] =
|
||||
tweetLens[Option[Seq[HashtagEntity]]](_.hashtags, (t, v) => t.copy(hashtags = v))
|
||||
|
||||
val hashtags: Lens[Tweet, Seq[HashtagEntity]] =
|
||||
tweetLens[Seq[HashtagEntity]](_.hashtags.toSeq.flatten, (t, v) => t.copy(hashtags = Some(v)))
|
||||
|
||||
val optCashtags: Lens[Tweet, Option[Seq[CashtagEntity]]] =
|
||||
tweetLens[Option[Seq[CashtagEntity]]](_.cashtags, (t, v) => t.copy(cashtags = v))
|
||||
|
||||
val cashtags: Lens[Tweet, Seq[CashtagEntity]] =
|
||||
tweetLens[Seq[CashtagEntity]](_.cashtags.toSeq.flatten, (t, v) => t.copy(cashtags = Some(v)))
|
||||
|
||||
val optMedia: Lens[Tweet, Option[Seq[MediaEntity]]] =
|
||||
tweetLens[Option[Seq[MediaEntity]]](_.media, (t, v) => t.copy(media = v))
|
||||
|
||||
val media: Lens[Tweet, Seq[MediaEntity]] =
|
||||
tweetLens[Seq[MediaEntity]](_.media.toSeq.flatten, (t, v) => t.copy(media = Some(v)))
|
||||
|
||||
val mediaKeys: Lens[Tweet, Seq[MediaKey]] =
|
||||
tweetLens[Seq[MediaKey]](
|
||||
_.mediaKeys.toSeq.flatten,
|
||||
{
|
||||
case (t, v) => t.copy(mediaKeys = Some(v))
|
||||
})
|
||||
|
||||
val place: Lens[Tweet, Option[Place]] =
|
||||
tweetLens[Option[Place]](
|
||||
_.place,
|
||||
{
|
||||
case (t, v) => t.copy(place = v)
|
||||
})
|
||||
|
||||
val quotedTweet: Lens[Tweet, Option[QuotedTweet]] =
|
||||
tweetLens[Option[QuotedTweet]](
|
||||
_.quotedTweet,
|
||||
{
|
||||
case (t, v) => t.copy(quotedTweet = v)
|
||||
})
|
||||
|
||||
val selfThreadMetadata: Lens[Tweet, Option[SelfThreadMetadata]] =
|
||||
tweetLens[Option[SelfThreadMetadata]](
|
||||
_.selfThreadMetadata,
|
||||
{
|
||||
case (t, v) => t.copy(selfThreadMetadata = v)
|
||||
})
|
||||
|
||||
val composerSource: Lens[Tweet, Option[ComposerSource]] =
|
||||
tweetLens[Option[ComposerSource]](
|
||||
_.composerSource,
|
||||
{
|
||||
case (t, v) => t.copy(composerSource = v)
|
||||
})
|
||||
|
||||
val deviceSource: Lens[Tweet, Option[DeviceSource]] =
|
||||
tweetLens[Option[DeviceSource]](
|
||||
_.deviceSource,
|
||||
{
|
||||
case (t, v) => t.copy(deviceSource = v)
|
||||
})
|
||||
|
||||
val perspective: Lens[Tweet, Option[StatusPerspective]] =
|
||||
tweetLens[Option[StatusPerspective]](
|
||||
_.perspective,
|
||||
{
|
||||
case (t, v) => t.copy(perspective = v)
|
||||
})
|
||||
|
||||
val cards: Lens[Tweet, Option[Seq[Card]]] =
|
||||
tweetLens[Option[Seq[Card]]](
|
||||
_.cards,
|
||||
{
|
||||
case (t, v) => t.copy(cards = v)
|
||||
})
|
||||
|
||||
val card2: Lens[Tweet, Option[Card2]] =
|
||||
tweetLens[Option[Card2]](
|
||||
_.card2,
|
||||
{
|
||||
case (t, v) => t.copy(card2 = v)
|
||||
})
|
||||
|
||||
val cardReference: Lens[Tweet, Option[CardReference]] =
|
||||
tweetLens[Option[CardReference]](
|
||||
_.cardReference,
|
||||
{
|
||||
case (t, v) => t.copy(cardReference = v)
|
||||
})
|
||||
|
||||
val spamLabel: Lens[Tweet, Option[SafetyLabel]] =
|
||||
tweetLens[Option[SafetyLabel]](
|
||||
_.spamLabel,
|
||||
{
|
||||
case (t, v) => t.copy(spamLabel = v)
|
||||
})
|
||||
|
||||
val lowQualityLabel: Lens[Tweet, Option[SafetyLabel]] =
|
||||
tweetLens[Option[SafetyLabel]](
|
||||
_.lowQualityLabel,
|
||||
{
|
||||
case (t, v) => t.copy(lowQualityLabel = v)
|
||||
})
|
||||
|
||||
val nsfwHighPrecisionLabel: Lens[Tweet, Option[SafetyLabel]] =
|
||||
tweetLens[Option[SafetyLabel]](
|
||||
_.nsfwHighPrecisionLabel,
|
||||
{
|
||||
case (t, v) => t.copy(nsfwHighPrecisionLabel = v)
|
||||
})
|
||||
|
||||
val bounceLabel: Lens[Tweet, Option[SafetyLabel]] =
|
||||
tweetLens[Option[SafetyLabel]](
|
||||
_.bounceLabel,
|
||||
{
|
||||
case (t, v) => t.copy(bounceLabel = v)
|
||||
})
|
||||
|
||||
val takedownCountryCodes: Lens[Tweet, Option[Seq[String]]] =
|
||||
tweetLens[Option[Seq[String]]](
|
||||
_.takedownCountryCodes,
|
||||
{
|
||||
case (t, v) => t.copy(takedownCountryCodes = v)
|
||||
})
|
||||
|
||||
val takedownReasons: Lens[Tweet, Option[Seq[TakedownReason]]] =
|
||||
tweetLens[Option[Seq[TakedownReason]]](
|
||||
_.takedownReasons,
|
||||
{
|
||||
case (t, v) => t.copy(takedownReasons = v)
|
||||
})
|
||||
|
||||
val contributor: Lens[Tweet, Option[Contributor]] =
|
||||
tweetLens[Option[Contributor]](
|
||||
_.contributor,
|
||||
{
|
||||
case (t, v) => t.copy(contributor = v)
|
||||
})
|
||||
|
||||
val mediaTags: Lens[Tweet, Option[TweetMediaTags]] =
|
||||
tweetLens[Option[TweetMediaTags]](
|
||||
_.mediaTags,
|
||||
{
|
||||
case (t, v) => t.copy(mediaTags = v)
|
||||
})
|
||||
|
||||
// View of a tweet's media tags as a plain Map keyed by media id.
// Reading yields an empty Map when the tweet has no mediaTags.
// Writing drops entries whose tag list is empty and stores None when
// nothing is left.
val mediaTagMap: Lens[Tweet, Map[MediaId, Seq[MediaTag]]] =
  tweetLens[Map[MediaId, Seq[MediaTag]]](
    tweet => {
      val existing = tweet.mediaTags.map { case TweetMediaTags(tagMap) => tagMap.toMap }
      existing.getOrElse(Map.empty)
    },
    (tweet, tagsByMedia) => {
      val nonEmptyEntries = tagsByMedia.filterNot { case (_, tags) => tags.isEmpty }
      val updated: Option[TweetMediaTags] =
        if (nonEmptyEntries.isEmpty) None else Some(TweetMediaTags(nonEmptyEntries))
      tweet.copy(mediaTags = updated)
    }
  )
|
||||
|
||||
val escherbirdEntityAnnotations: Lens[Tweet, Option[EscherbirdEntityAnnotations]] =
|
||||
tweetLens[Option[EscherbirdEntityAnnotations]](
|
||||
_.escherbirdEntityAnnotations,
|
||||
{
|
||||
case (t, v) => t.copy(escherbirdEntityAnnotations = v)
|
||||
})
|
||||
|
||||
val communities: Lens[Tweet, Option[Communities]] =
|
||||
tweetLens[Option[Communities]](
|
||||
_.communities,
|
||||
{
|
||||
case (t, v) => t.copy(communities = v)
|
||||
})
|
||||
|
||||
val tweetypieOnlyTakedownCountryCodes: Lens[Tweet, Option[Seq[String]]] =
|
||||
tweetLens[Option[Seq[String]]](
|
||||
_.tweetypieOnlyTakedownCountryCodes,
|
||||
{
|
||||
case (t, v) => t.copy(tweetypieOnlyTakedownCountryCodes = v)
|
||||
})
|
||||
|
||||
val tweetypieOnlyTakedownReasons: Lens[Tweet, Option[Seq[TakedownReason]]] =
|
||||
tweetLens[Option[Seq[TakedownReason]]](
|
||||
_.tweetypieOnlyTakedownReasons,
|
||||
{
|
||||
case (t, v) => t.copy(tweetypieOnlyTakedownReasons = v)
|
||||
})
|
||||
|
||||
val profileGeo: Lens[Tweet, Option[ProfileGeoEnrichment]] =
|
||||
tweetLens[Option[ProfileGeoEnrichment]](
|
||||
_.profileGeoEnrichment,
|
||||
(t, v) => t.copy(profileGeoEnrichment = v)
|
||||
)
|
||||
|
||||
val visibleTextRange: Lens[Tweet, Option[TextRange]] =
|
||||
tweetLens[Option[TextRange]](
|
||||
_.visibleTextRange,
|
||||
{
|
||||
case (t, v) => t.copy(visibleTextRange = v)
|
||||
})
|
||||
|
||||
val selfPermalink: Lens[Tweet, Option[ShortenedUrl]] =
|
||||
tweetLens[Option[ShortenedUrl]](
|
||||
_.selfPermalink,
|
||||
{
|
||||
case (t, v) => t.copy(selfPermalink = v)
|
||||
})
|
||||
|
||||
val extendedTweetMetadata: Lens[Tweet, Option[ExtendedTweetMetadata]] =
|
||||
tweetLens[Option[ExtendedTweetMetadata]](
|
||||
_.extendedTweetMetadata,
|
||||
{
|
||||
case (t, v) => t.copy(extendedTweetMetadata = v)
|
||||
})
|
||||
|
||||
object TweetCoreData {
|
||||
val userId: Lens[TweetCoreData, UserId] = checkEq[TweetCoreData, UserId](
|
||||
_.userId,
|
||||
{ (c, v) =>
|
||||
// Pleases the compiler: https://github.com/scala/bug/issues/9171
|
||||
val userId = v
|
||||
c.copy(userId = userId)
|
||||
})
|
||||
val text: Lens[TweetCoreData, String] = checkEq[TweetCoreData, String](
|
||||
_.text,
|
||||
{ (c, v) =>
|
||||
// Pleases the compiler: https://github.com/scala/bug/issues/9171
|
||||
val text = v
|
||||
c.copy(text = text)
|
||||
})
|
||||
// Lens onto `createdAtSecs`. The value is a plain Long timestamp, so it is
// typed as Long rather than with an id alias.
val createdAt: Lens[TweetCoreData, Long] =
  checkEq[TweetCoreData, Long](_.createdAtSecs, (c, v) => c.copy(createdAtSecs = v))
|
||||
val createdVia: Lens[TweetCoreData, String] =
|
||||
checkEq[TweetCoreData, String](
|
||||
_.createdVia,
|
||||
{
|
||||
case (c, v) => c.copy(createdVia = v)
|
||||
})
|
||||
val hasTakedown: Lens[TweetCoreData, Boolean] =
|
||||
checkEq[TweetCoreData, Boolean](
|
||||
_.hasTakedown,
|
||||
{
|
||||
case (c, v) => c.copy(hasTakedown = v)
|
||||
})
|
||||
val nullcast: Lens[TweetCoreData, Boolean] =
|
||||
checkEq[TweetCoreData, Boolean](
|
||||
_.nullcast,
|
||||
{
|
||||
case (c, v) => c.copy(nullcast = v)
|
||||
})
|
||||
val nsfwUser: Lens[TweetCoreData, Boolean] =
|
||||
checkEq[TweetCoreData, Boolean](
|
||||
_.nsfwUser,
|
||||
{
|
||||
case (c, v) => c.copy(nsfwUser = v)
|
||||
})
|
||||
val nsfwAdmin: Lens[TweetCoreData, Boolean] =
|
||||
checkEq[TweetCoreData, Boolean](
|
||||
_.nsfwAdmin,
|
||||
{
|
||||
case (c, v) => c.copy(nsfwAdmin = v)
|
||||
})
|
||||
val reply: Lens[TweetCoreData, Option[Reply]] =
|
||||
checkEq[TweetCoreData, Option[Reply]](
|
||||
_.reply,
|
||||
{
|
||||
case (c, v) => c.copy(reply = v)
|
||||
})
|
||||
val share: Lens[TweetCoreData, Option[Share]] =
|
||||
checkEq[TweetCoreData, Option[Share]](
|
||||
_.share,
|
||||
{
|
||||
case (c, v) => c.copy(share = v)
|
||||
})
|
||||
val narrowcast: Lens[TweetCoreData, Option[Narrowcast]] =
|
||||
checkEq[TweetCoreData, Option[Narrowcast]](
|
||||
_.narrowcast,
|
||||
{
|
||||
case (c, v) => c.copy(narrowcast = v)
|
||||
})
|
||||
val directedAtUser: Lens[TweetCoreData, Option[DirectedAtUser]] =
|
||||
checkEq[TweetCoreData, Option[DirectedAtUser]](
|
||||
_.directedAtUser,
|
||||
{
|
||||
case (c, v) => c.copy(directedAtUser = v)
|
||||
})
|
||||
val conversationId: Lens[TweetCoreData, Option[ConversationId]] =
|
||||
checkEq[TweetCoreData, Option[ConversationId]](
|
||||
_.conversationId,
|
||||
{
|
||||
case (c, v) => c.copy(conversationId = v)
|
||||
})
|
||||
val placeId: Lens[TweetCoreData, Option[String]] =
|
||||
checkEq[TweetCoreData, Option[String]](
|
||||
_.placeId,
|
||||
{
|
||||
case (c, v) => c.copy(placeId = v)
|
||||
})
|
||||
val geoCoordinates: Lens[TweetCoreData, Option[GeoCoordinates]] =
|
||||
checkEq[TweetCoreData, Option[GeoCoordinates]](
|
||||
_.coordinates,
|
||||
(c, v) => c.copy(coordinates = v)
|
||||
)
|
||||
// Lens onto the optional `trackingId`. It is built over Option[Long], so it
// is typed that way rather than with the tweet-id alias.
val trackingId: Lens[TweetCoreData, Option[Long]] =
  checkEq[TweetCoreData, Option[Long]](
    _.trackingId,
    {
      case (c, v) => c.copy(trackingId = v)
    })
|
||||
val hasMedia: Lens[TweetCoreData, Option[Boolean]] =
|
||||
checkEq[TweetCoreData, Option[Boolean]](
|
||||
_.hasMedia,
|
||||
{
|
||||
case (c, v) => c.copy(hasMedia = v)
|
||||
})
|
||||
}
|
||||
|
||||
val counts: Lens[Tweet, Option[StatusCounts]] =
|
||||
tweetLens[Option[StatusCounts]](
|
||||
_.counts,
|
||||
{
|
||||
case (t, v) => t.copy(counts = v)
|
||||
})
|
||||
|
||||
// Lenses onto the individual optional counters of StatusCounts.
// The counters are plain Longs, so the lenses are typed Option[Long]
// (matching the checkEq type arguments) rather than with an id alias.
object StatusCounts {
  val retweetCount: Lens[StatusCounts, Option[Long]] =
    checkEq[StatusCounts, Option[Long]](
      _.retweetCount,
      (c, retweetCount) => c.copy(retweetCount = retweetCount)
    )

  val replyCount: Lens[StatusCounts, Option[Long]] =
    checkEq[StatusCounts, Option[Long]](
      _.replyCount,
      (c, replyCount) => c.copy(replyCount = replyCount)
    )

  val favoriteCount: Lens[StatusCounts, Option[Long]] =
    checkEq[StatusCounts, Option[Long]](
      _.favoriteCount,
      {
        case (c, v) => c.copy(favoriteCount = v)
      })

  val quoteCount: Lens[StatusCounts, Option[Long]] =
    checkEq[StatusCounts, Option[Long]](
      _.quoteCount,
      {
        case (c, v) => c.copy(quoteCount = v)
      })
}
|
||||
|
||||
val userId: Lens[Tweet, UserId] = requiredCoreData andThen TweetCoreData.userId
|
||||
val text: Lens[Tweet, String] = requiredCoreData andThen TweetCoreData.text
|
||||
val createdVia: Lens[Tweet, String] = requiredCoreData andThen TweetCoreData.createdVia
|
||||
// Tweet-level view of TweetCoreData.createdAt (requires coreData to be set).
// Typed as Long: the value is a timestamp, not a conversation id.
val createdAt: Lens[Tweet, Long] = requiredCoreData andThen TweetCoreData.createdAt
|
||||
val reply: Lens[Tweet, Option[Reply]] = requiredCoreData andThen TweetCoreData.reply
|
||||
val share: Lens[Tweet, Option[Share]] = requiredCoreData andThen TweetCoreData.share
|
||||
val narrowcast: Lens[Tweet, Option[Narrowcast]] =
|
||||
requiredCoreData andThen TweetCoreData.narrowcast
|
||||
val directedAtUser: Lens[Tweet, Option[DirectedAtUser]] =
|
||||
requiredCoreData andThen TweetCoreData.directedAtUser
|
||||
val conversationId: Lens[Tweet, Option[ConversationId]] =
|
||||
requiredCoreData andThen TweetCoreData.conversationId
|
||||
val placeId: Lens[Tweet, Option[String]] = requiredCoreData andThen TweetCoreData.placeId
|
||||
val geoCoordinates: Lens[Tweet, Option[GeoCoordinates]] =
|
||||
requiredCoreData andThen TweetCoreData.geoCoordinates
|
||||
val hasTakedown: Lens[Tweet, Boolean] = requiredCoreData andThen TweetCoreData.hasTakedown
|
||||
val nsfwAdmin: Lens[Tweet, Boolean] = requiredCoreData andThen TweetCoreData.nsfwAdmin
|
||||
val nsfwUser: Lens[Tweet, Boolean] = requiredCoreData andThen TweetCoreData.nsfwUser
|
||||
val nullcast: Lens[Tweet, Boolean] = requiredCoreData andThen TweetCoreData.nullcast
|
||||
// Tweet-level view of TweetCoreData.trackingId (requires coreData to be set).
// Typed as Option[Long]: the value is a tracking id, not a conversation id.
val trackingId: Lens[Tweet, Option[Long]] =
  requiredCoreData andThen TweetCoreData.trackingId
|
||||
val hasMedia: Lens[Tweet, Option[Boolean]] = requiredCoreData andThen TweetCoreData.hasMedia
|
||||
|
||||
// Lenses onto the fields of a CashtagEntity.
object CashtagEntity {
  // The (fromIndex, toIndex) pair, read and written together.
  val indices: Lens[CashtagEntity, (Short, Short)] =
    checkEq[CashtagEntity, (Short, Short)](
      entity => (entity.fromIndex, entity.toIndex),
      (entity, range) => {
        val (from, to) = range
        entity.copy(fromIndex = from, toIndex = to)
      }
    )
  val text: Lens[CashtagEntity, String] =
    checkEq[CashtagEntity, String](
      entity => entity.text,
      (entity, text) => entity.copy(text = text)
    )
}
|
||||
|
||||
// Lenses onto the fields of a HashtagEntity.
object HashtagEntity {
  // The (fromIndex, toIndex) pair, read and written together.
  val indices: Lens[HashtagEntity, (Short, Short)] =
    checkEq[HashtagEntity, (Short, Short)](
      entity => (entity.fromIndex, entity.toIndex),
      (entity, range) => {
        val (from, to) = range
        entity.copy(fromIndex = from, toIndex = to)
      }
    )
  val text: Lens[HashtagEntity, String] =
    checkEq[HashtagEntity, String](
      entity => entity.text,
      (entity, text) => entity.copy(text = text)
    )
}
|
||||
|
||||
// Lenses onto the fields of a MediaEntity.
object MediaEntity {
  // The (fromIndex, toIndex) pair, read and written together.
  val indices: Lens[MediaEntity, (Short, Short)] =
    checkEq[MediaEntity, (Short, Short)](
      entity => (entity.fromIndex, entity.toIndex),
      (entity, range) => {
        val (from, to) = range
        entity.copy(fromIndex = from, toIndex = to)
      }
    )
  // The entity's `sizes` set.
  val mediaSizes: Lens[MediaEntity, collection.Set[MediaSize]] =
    checkEq[MediaEntity, scala.collection.Set[MediaSize]](
      entity => entity.sizes,
      (entity, sizes) => entity.copy(sizes = sizes)
    )
  val url: Lens[MediaEntity, String] =
    checkEq[MediaEntity, String](
      entity => entity.url,
      {
        case (entity, newUrl) => entity.copy(url = newUrl)
      })
  val mediaInfo: Lens[MediaEntity, Option[MediaInfo]] =
    checkEq[MediaEntity, Option[MediaInfo]](
      entity => entity.mediaInfo,
      {
        case (entity, newInfo) => entity.copy(mediaInfo = newInfo)
      })
}
|
||||
|
||||
// Lenses onto the fields of a MentionEntity.
object MentionEntity {
  // The (fromIndex, toIndex) pair, read and written together.
  val indices: Lens[MentionEntity, (Short, Short)] =
    checkEq[MentionEntity, (Short, Short)](
      entity => (entity.fromIndex, entity.toIndex),
      (entity, range) => {
        val (from, to) = range
        entity.copy(fromIndex = from, toIndex = to)
      }
    )
  val screenName: Lens[MentionEntity, String] =
    checkEq[MentionEntity, String](
      entity => entity.screenName,
      (entity, screenName) => entity.copy(screenName = screenName)
    )
}
|
||||
|
||||
// Lenses onto the fields of a UrlEntity.
object UrlEntity {
  // The (fromIndex, toIndex) pair, read and written together.
  val indices: Lens[UrlEntity, (Short, Short)] =
    checkEq[UrlEntity, (Short, Short)](
      entity => (entity.fromIndex, entity.toIndex),
      (entity, range) => {
        val (from, to) = range
        entity.copy(fromIndex = from, toIndex = to)
      }
    )
  val url: Lens[UrlEntity, String] =
    checkEq[UrlEntity, String](
      entity => entity.url,
      (entity, url) => entity.copy(url = url)
    )
}
|
||||
|
||||
// Lenses onto the fields of a Contributor.
object Contributor {
  // The contributor's optional screen name.
  val screenName: Lens[Contributor, Option[String]] =
    checkEq[Contributor, Option[String]](
      contributor => contributor.screenName,
      (contributor, screenName) => contributor.copy(screenName = screenName)
    )
}
|
||||
|
||||
// Lenses onto the fields of a Reply.
object Reply {
  // Optional screen name recorded on the reply.
  val inReplyToScreenName: Lens[Reply, Option[String]] =
    checkEq[Reply, Option[String]](
      reply => reply.inReplyToScreenName,
      (reply, inReplyToScreenName) => reply.copy(inReplyToScreenName = inReplyToScreenName)
    )

  // Optional id of the status recorded on the reply.
  val inReplyToStatusId: Lens[Reply, Option[TweetId]] =
    checkEq[Reply, Option[TweetId]](
      reply => reply.inReplyToStatusId,
      (reply, inReplyToStatusId) => reply.copy(inReplyToStatusId = inReplyToStatusId)
    )
}
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
package com.twitter.tweetypie.util
|
||||
|
||||
import com.twitter.tweetutil.TweetPermalink
|
||||
import com.twitter.tweetypie.thriftscala._
|
||||
|
||||
/**
 * Helpers for finding tweet permalinks among a tweet's URL entities.
 */
object TweetPermalinkUtil {

  /**
   * The last of the tweet's URL entities (as read through TweetLenses.urls)
   * whose expanded URL parses as a tweet permalink, paired with the parsed
   * permalink. None when no entity matches.
   */
  def lastQuotedTweetPermalink(tweet: Tweet): Option[(UrlEntity, TweetPermalink)] = {
    val urlEntities = TweetLenses.urls.get(tweet)
    lastQuotedTweetPermalink(urlEntities)
  }

  /**
   * Same as above, for an explicit sequence of URL entities: every entity is
   * matched and the last match in sequence order is returned.
   */
  def lastQuotedTweetPermalink(urls: Seq[UrlEntity]): Option[(UrlEntity, TweetPermalink)] = {
    val matches = urls.flatMap(url => matchQuotedTweetPermalink(url))
    matches.lastOption
  }

  /**
   * Pair the entity with the permalink parsed from its `expanded` URL.
   * Yields None when the entity has no expanded URL or when
   * TweetPermalink.parse does not produce a permalink for it.
   */
  def matchQuotedTweetPermalink(entity: UrlEntity): Option[(UrlEntity, TweetPermalink)] =
    entity.expanded.flatMap { expandedUrl =>
      TweetPermalink.parse(expandedUrl).map(permalink => (entity, permalink))
    }
}
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue