1
0
Fork 0
mirror of https://github.com/twitter/the-algorithm.git synced 2025-01-09 22:05:51 +00:00

Open-sourcing Tweetypie

Tweetypie is the core Tweet service that handles the reading and writing of Tweet data.
This commit is contained in:
twitter-team 2023-05-12 09:11:38 -07:00
parent 90d7ea370e
commit 01dbfee4c0
591 changed files with 68352 additions and 0 deletions
README.md
tweetypie/common/src/scala/com/twitter/tweetypie
additionalfields
caching
client_id
context
decider
jiminy/tweetypie
matching
media
storage
tflock
thriftscala
tweettext
util

View file

@ -24,6 +24,7 @@ Product surfaces at Twitter are built on a shared set of data, models, and softw
| | [timelines-aggregation-framework](timelines/data_processing/ml_util/aggregation_framework/README.md) | Framework for generating aggregate features in batch or real time. |
| | [representation-manager](representation-manager/README.md) | Service to retrieve embeddings (i.e. SimClusers and TwHIN). |
| | [twml](twml/README.md) | Legacy machine learning framework built on TensorFlow v1. |
| | [Tweetypie](tweetypie/server/README.md) | Core Tweet service that handles the reading and writing of Tweet data. |
The product surface currently included in this repository is the For You Timeline.

View file

@ -0,0 +1,118 @@
package com.twitter.tweetypie.additionalfields
import com.twitter.tweetypie.thriftscala.Tweet
import com.twitter.scrooge.TFieldBlob
import com.twitter.scrooge.ThriftStructField
object AdditionalFields {
type FieldId = Short
/** additional fields really start at 100, be we are ignoring conversation id for now */
val StartAdditionalId = 101
/** all known [[Tweet]] field IDs */
val CompiledFieldIds: Seq[FieldId] = Tweet.metaData.fields.map(_.id)
/** all known [[Tweet]] fields in the "additional-field" range (excludes id) */
val CompiledAdditionalFieldMetaDatas: Seq[ThriftStructField[Tweet]] =
Tweet.metaData.fields.filter(f => isAdditionalFieldId(f.id))
val CompiledAdditionalFieldsMap: Map[Short, ThriftStructField[Tweet]] =
CompiledAdditionalFieldMetaDatas.map(field => (field.id, field)).toMap
/** all known [[Tweet]] field IDs in the "additional-field" range */
val CompiledAdditionalFieldIds: Seq[FieldId] =
CompiledAdditionalFieldsMap.keys.toSeq
/** all [[Tweet]] field IDs which should be rejected when set as additional
* fields on via PostTweetRequest.additionalFields or RetweetRequest.additionalFields */
val RejectedFieldIds: Seq[FieldId] = Seq(
// Should be provided via PostTweetRequest.conversationControl field. go/convocontrolsbackend
Tweet.ConversationControlField.id,
// This field should only be set based on whether the client sets the right community
// tweet annotation.
Tweet.CommunitiesField.id,
// This field should not be set by clients and should opt for
// [[PostTweetRequest.ExclusiveTweetControlOptions]].
// The exclusiveTweetControl field requires the userId to be set
// and we shouldn't trust the client to provide the right one.
Tweet.ExclusiveTweetControlField.id,
// This field should not be set by clients and should opt for
// [[PostTweetRequest.TrustedFriendsControlOptions]].
// The trustedFriendsControl field requires the trustedFriendsListId to be
// set and we shouldn't trust the client to provide the right one.
Tweet.TrustedFriendsControlField.id,
// This field should not be set by clients and should opt for
// [[PostTweetRequest.CollabControlOptions]].
// The collabControl field requires a list of Collaborators to be
// set and we shouldn't trust the client to provide the right one.
Tweet.CollabControlField.id
)
def isAdditionalFieldId(fieldId: FieldId): Boolean =
fieldId >= StartAdditionalId
/**
* Provides a list of all additional field IDs on the tweet, which include all
* the compiled additional fields and all the provided passthrough fields. This includes
* compiled additional fields where the value is None.
*/
def allAdditionalFieldIds(tweet: Tweet): Seq[FieldId] =
CompiledAdditionalFieldIds ++ tweet._passthroughFields.keys
/**
* Provides a list of all field IDs that have a value on the tweet which are not known compiled
* additional fields (excludes [[Tweet.id]]).
*/
def unsettableAdditionalFieldIds(tweet: Tweet): Seq[FieldId] =
CompiledFieldIds
.filter { id =>
!isAdditionalFieldId(id) && id != Tweet.IdField.id && tweet.getFieldBlob(id).isDefined
} ++
tweet._passthroughFields.keys
/**
* Provides a list of all field IDs that have a value on the tweet which are explicitly disallowed
* from being set via PostTweetRequest.additionalFields and RetweetRequest.additionalFields
*/
def rejectedAdditionalFieldIds(tweet: Tweet): Seq[FieldId] =
RejectedFieldIds
.filter { id => tweet.getFieldBlob(id).isDefined }
def unsettableAdditionalFieldIdsErrorMessage(unsettableFieldIds: Seq[FieldId]): String =
s"request may not contain fields: [${unsettableFieldIds.sorted.mkString(", ")}]"
/**
* Provides a list of all additional field IDs that have a value on the tweet,
* compiled and passthrough (excludes Tweet.id).
*/
def nonEmptyAdditionalFieldIds(tweet: Tweet): Seq[FieldId] =
CompiledAdditionalFieldMetaDatas.collect {
case f if f.getValue(tweet) != None => f.id
} ++ tweet._passthroughFields.keys
def additionalFields(tweet: Tweet): Seq[TFieldBlob] =
(tweet.getFieldBlobs(CompiledAdditionalFieldIds) ++ tweet._passthroughFields).values.toSeq
/**
* Merge base tweet with additional fields.
* Non-additional fields in the additional tweet are ignored.
* @param base: a tweet that contains basic fields
* @param additional: a tweet object that carries additional fields
*/
def setAdditionalFields(base: Tweet, additional: Tweet): Tweet =
setAdditionalFields(base, additionalFields(additional))
def setAdditionalFields(base: Tweet, additional: Option[Tweet]): Tweet =
additional.map(setAdditionalFields(base, _)).getOrElse(base)
def setAdditionalFields(base: Tweet, additional: Traversable[TFieldBlob]): Tweet =
additional.foldLeft(base) { case (t, f) => t.setField(f) }
/**
* Unsets the specified fields on the given tweet.
*/
def unsetFields(tweet: Tweet, fieldIds: Iterable[FieldId]): Tweet = {
tweet.unsetFields(fieldIds.toSet)
}
}

View file

@ -0,0 +1,15 @@
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/org/apache/thrift:libthrift",
"mediaservices/commons/src/main/thrift:thrift-scala",
"scrooge/scrooge-core",
"src/thrift/com/twitter/escherbird:media-annotation-structs-scala",
"src/thrift/com/twitter/spam/rtf:safety-label-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
],
)

View file

@ -0,0 +1,15 @@
scala_library(
compiler_option_sets = ["fatal_warnings"],
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"finagle/finagle-memcached/src/main/scala",
"scrooge/scrooge-serializer",
"stitch/stitch-core",
"util/util-core",
"util/util-logging",
# CachedValue struct
"tweetypie/servo/repo/src/main/thrift:thrift-scala",
"util/util-slf4j-api/src/main/scala/com/twitter/util/logging",
],
)

View file

@ -0,0 +1,241 @@
package com.twitter.tweetypie.caching
import com.twitter.finagle.service.StatsFilter
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.finagle.stats.ExceptionStatsHandler
import com.twitter.finagle.stats.Counter
import com.twitter.util.Future
import com.twitter.util.logging.Logger
import com.twitter.finagle.memcached
import scala.util.control.NonFatal
/**
* Wrapper around a memcached client that performs serialization and
* deserialization, tracks stats, provides tracing, and provides
* per-key fresh/stale/failure/miss results.
*
* The operations that write values to cache will only write values
* that the ValueSerializer says are cacheable. The idea here is that
* the deserialize and serialize functions must be coherent, and no
* matter how you choose to write these values back to cache, the
* serializer will have the appropriate knowledge about whether the
* values are cacheable.
*
* For most cases, you will want to use [[StitchCaching]] rather than
* calling this wrapper directly.
*
* @param keySerializer How to convert a K value to a memcached key.
*
* @param valueSerializer How to serialize and deserialize V values,
* as well as which values are cacheable, and how long to store the
* values in cache.
*/
class CacheOperations[K, V](
keySerializer: K => String,
valueSerializer: ValueSerializer[V],
memcachedClient: memcached.Client,
statsReceiver: StatsReceiver,
logger: Logger,
exceptionStatsHandler: ExceptionStatsHandler = StatsFilter.DefaultExceptions) {
// The memcached operations that are performed via this
// [[CacheOperations]] instance will be tracked under this stats
// receiver.
//
// We count all memcached failures together under this scope,
// because memcached operations should not fail unless there are
// communication problems, so differentiating the method that was
// being called will not give us any useful information.
private[this] val memcachedStats: StatsReceiver = statsReceiver.scope("memcached")
// Incremented for every attempt to `get` a key from cache.
private[this] val memcachedGetCounter: Counter = memcachedStats.counter("get")
// One of these two counters is incremented for every successful
// response returned from a `get` call to memcached.
private[this] val memcachedNotFoundCounter: Counter = memcachedStats.counter("not_found")
private[this] val memcachedFoundCounter: Counter = memcachedStats.counter("found")
// Records the state of the cache load after serialization. The
// policy may transform a value that was successfully loaded from
// cache into any result type, which is why we explicitly track
// "found" and "not_found" above. If `stale` + `fresh` is not equal
// to `found`, then it means that the policy has translated a found
// value into a miss or failure. The policy may do this in order to
// cause the caching filter to treat the value that was found in
// cache in the way it would have treated a miss or failure from
// cache.
private[this] val resultStats: StatsReceiver = statsReceiver.scope("result")
private[this] val resultFreshCounter: Counter = resultStats.counter("fresh")
private[this] val resultStaleCounter: Counter = resultStats.counter("stale")
private[this] val resultMissCounter: Counter = resultStats.counter("miss")
private[this] val resultFailureCounter: Counter = resultStats.counter("failure")
// Used for recording exceptions that occurred during
// deserialization. This will never be incremented if the
// deserializer returns a result, even if the result is a
// [[CacheResult.Failure]]. See the comment where this stat is
// incremented for more details.
private[this] val deserializeFailureStats: StatsReceiver = statsReceiver.scope("deserialize")
private[this] val notSerializedCounter: Counter = statsReceiver.counter("not_serialized")
/**
* Load a batch of values from cache. Mostly this deals with
* converting the [[memcached.GetResult]] to a
* [[Seq[CachedResult[V]]]]. The result is in the same order as the
* keys, and there will always be an entry for each key. This method
* should never return a [[Future.exception]].
*/
def get(keys: Seq[K]): Future[Seq[CacheResult[V]]] = {
memcachedGetCounter.incr(keys.size)
val cacheKeys: Seq[String] = keys.map(keySerializer)
if (logger.isTraceEnabled) {
logger.trace {
val lines: Seq[String] = keys.zip(cacheKeys).map { case (k, c) => s"\n $k ($c)" }
"Starting load for keys:" + lines.mkString
}
}
memcachedClient
.getResult(cacheKeys)
.map { getResult =>
memcachedNotFoundCounter.incr(getResult.misses.size)
val results: Seq[CacheResult[V]] =
cacheKeys.map { cacheKey =>
val result: CacheResult[V] =
getResult.hits.get(cacheKey) match {
case Some(memcachedValue) =>
memcachedFoundCounter.incr()
try {
valueSerializer.deserialize(memcachedValue.value)
} catch {
case NonFatal(e) =>
// If the serializer throws an exception, then
// the serialized value was malformed. In that
// case, we record the failure so that it can be
// detected and fixed, but treat it as a cache
// miss. The reason that we treat it as a miss
// rather than a failure is that a miss will
// cause a write back to cache, and we want to
// write a valid result back to cache to replace
// the bad entry that we just loaded.
//
// A serializer is free to return Miss itself to
// obtain this behavior if it is expected or
// desired, to avoid the logging and stats (and
// the minor overhead of catching an exception).
//
// The exceptions are tracked separately from
// other exceptions so that it is easy to see
// whether the deserializer itself ever throws an
// exception.
exceptionStatsHandler.record(deserializeFailureStats, e)
logger.warn(s"Failed deserializing value for cache key $cacheKey", e)
CacheResult.Miss
}
case None if getResult.misses.contains(cacheKey) =>
CacheResult.Miss
case None =>
val exception =
getResult.failures.get(cacheKey) match {
case None =>
// To get here, this was not a hit or a miss,
// so we expect the key to be present in
// failures. If it is not, then either the
// contract of getResult was violated, or this
// method is somehow attempting to access a
// result for a key that was not
// loaded. Either of these indicates a bug, so
// we log a high priority log message.
logger.error(
s"Key $cacheKey not found in hits, misses or failures. " +
"This indicates a bug in the memcached library or " +
"CacheOperations.load"
)
// We return this as a failure because that
// will cause the repo to be consulted and the
// value *not* to be written back to cache,
// which is probably the safest thing to do
// (if we don't know what's going on, default
// to an uncached repo).
new IllegalStateException
case Some(e) =>
e
}
exceptionStatsHandler.record(memcachedStats, exception)
CacheResult.Failure(exception)
}
// Count each kind of CacheResult, to make it possible to
// see how effective the caching is.
result match {
case CacheResult.Fresh(_) => resultFreshCounter.incr()
case CacheResult.Stale(_) => resultStaleCounter.incr()
case CacheResult.Miss => resultMissCounter.incr()
case CacheResult.Failure(_) => resultFailureCounter.incr()
}
result
}
if (logger.isTraceEnabled) {
logger.trace {
val lines: Seq[String] =
(keys, cacheKeys, results).zipped.map {
case (key, cacheKey, result) => s"\n $key ($cacheKey) -> $result"
}
"Cache results:" + lines.mkString
}
}
results
}
.handle {
case e =>
// If there is a failure from the memcached client, fan it
// out to each cache key, so that the caller does not need
// to handle failure of the batch differently than failure
// of individual keys. This should be rare anyway, since the
// memcached client already does this for common Finagle
// exceptions
resultFailureCounter.incr(keys.size)
val theFailure: CacheResult[V] = CacheResult.Failure(e)
keys.map { _ =>
// Record this as many times as we would if it were in the GetResult
exceptionStatsHandler.record(memcachedStats, e)
theFailure
}
}
}
// Incremented for every attempt to `set` a key in value.
private[this] val memcachedSetCounter: Counter = memcachedStats.counter("set")
/**
* Write an entry back to cache, using `set`. If the serializer does
* not serialize the value, then this method will immediately return
* with success.
*/
def set(key: K, value: V): Future[Unit] =
valueSerializer.serialize(value) match {
case Some((expiry, serialized)) =>
if (logger.isTraceEnabled) {
logger.trace(s"Writing back to cache $key -> $value (expiry = $expiry)")
}
memcachedSetCounter.incr()
memcachedClient
.set(key = keySerializer(key), flags = 0, expiry = expiry, value = serialized)
.onFailure(exceptionStatsHandler.record(memcachedStats, _))
case None =>
if (logger.isTraceEnabled) {
logger.trace(s"Not writing back $key -> $value")
}
notSerializedCounter.incr()
Future.Done
}
}

View file

@ -0,0 +1,45 @@
package com.twitter.tweetypie.caching
/**
* Encodes the possible states of a value loaded from memcached.
*
* @see [[ValueSerializer]] and [[CacheOperations]]
*/
sealed trait CacheResult[+V]
object CacheResult {
/**
* Signals that the value could not be successfully loaded from
* cache. `Failure` values should not be written back to cache.
*
* This value may result from an error talking to the memcached
* instance or it may be returned from the Serializer when the value
* should not be reused, but should also not be overwritten.
*/
final case class Failure(e: Throwable) extends CacheResult[Nothing]
/**
* Signals that the cache load attempt was successful, but there was
* not a usable value.
*
* When processing a `Miss`, the value should be written back to
* cache if it loads successfully.
*/
case object Miss extends CacheResult[Nothing]
/**
* Signals that the value was found in cache.
*
* It is not necessary to load the value from the original source.
*/
case class Fresh[V](value: V) extends CacheResult[V]
/**
* Signals that the value was found in cache.
*
* This value should be used, but it should be refreshed
* out-of-band.
*/
case class Stale[V](value: V) extends CacheResult[V]
}

View file

@ -0,0 +1,34 @@
package com.twitter.tweetypie.caching
import com.twitter.util.Duration
import com.twitter.util.Time
/**
* Helpers for creating common expiry functions.
*
* An expiry function maps from the value to a time in the future when
* the value should expire from cache. These are useful in the
* implementation of a [[ValueSerializer]].
*/
object Expiry {
/**
* Return a time that indicates to memcached to never expire this
* value.
*
* This function takes [[Any]] so that it can be used at any value
* type, since it doesn't examine the value at all.
*/
val Never: Any => Time =
_ => Time.Top
/**
* Return function that indicates to memcached that the value should
* not be used after the `ttl` has elapsed.
*
* This function takes [[Any]] so that it can be used at any value
* type, since it doesn't examine the value at all.
*/
def byAge(ttl: Duration): Any => Time =
_ => Time.now + ttl
}

View file

@ -0,0 +1,140 @@
package com.twitter.tweetypie.caching
import com.twitter.io.Buf
import com.twitter.scrooge.CompactThriftSerializer
import com.twitter.scrooge.ThriftStruct
import com.twitter.scrooge.ThriftStructCodec
import com.twitter.servo.cache.thriftscala.CachedValue
import com.twitter.servo.cache.thriftscala.CachedValueStatus
import com.twitter.stitch.NotFound
import com.twitter.util.Return
import com.twitter.util.Throw
import com.twitter.util.Time
import com.twitter.util.Try
import java.nio.ByteBuffer
object ServoCachedValueSerializer {
/**
* Thrown when the fields of the servo CachedValue struct do not
* satisfy the invariants expected by this serialization code.
*/
case class UnexpectedCachedValueState(cachedValue: CachedValue) extends Exception {
def message: String = s"Unexpected state for CachedValue. Value was: $cachedValue"
}
val CachedValueThriftSerializer: CompactThriftSerializer[CachedValue] = CompactThriftSerializer(
CachedValue)
}
/**
* A [[ValueSerializer]] that is compatible with the use of
* Servo's [[CachedValue]] struct by tweetypie:
*
* - The only [[CachedValueStatus]] values that are cacheable are
* [[CachedValueStatus.Found]] and [[CachedValueStatus.NotFound]].
*
* - We only track the `cachedAtMsec` field, because tweetypie's cache
* interaction does not use the other fields, and the values that
* are cached this way are never updated, so storing readThroughAt
* or writtenThroughAt would not add any information.
*
* - When values are present, they are serialized using
* [[org.apache.thrift.protocol.TCompactProtocol]].
*
* - The CachedValue struct itself is also serialized using TCompactProtocol.
*
* The serializer operates on [[Try]] values and will cache [[Return]]
* and `Throw(NotFound)` values.
*/
case class ServoCachedValueSerializer[V <: ThriftStruct](
codec: ThriftStructCodec[V],
expiry: Try[V] => Time,
softTtl: SoftTtl[Try[V]])
extends ValueSerializer[Try[V]] {
import ServoCachedValueSerializer.UnexpectedCachedValueState
import ServoCachedValueSerializer.CachedValueThriftSerializer
private[this] val ValueThriftSerializer = CompactThriftSerializer(codec)
/**
* Return an expiry based on the value and a
* TCompactProtocol-encoded servo CachedValue struct with the
* following fields defined:
*
* - `value`: [[None]]
* for {{{Throw(NotFound)}}, {{{Some(encodedStruct)}}} for
* [[Return]], where {{{encodedStruct}}} is a
* TCompactProtocol-encoding of the value inside of the Return.
*
* - `status`: [[CachedValueStatus.Found]] if the value is Return,
* and [[CachedValueStatus.NotFound]] if it is Throw(NotFound)
*
* - `cachedAtMsec`: The current time, accoring to [[Time.now]]
*
* No other fields will be defined.
*
* @throws IllegalArgumentException if called with a value that
* should not be cached.
*/
override def serialize(value: Try[V]): Option[(Time, Buf)] = {
def serializeCachedValue(payload: Option[ByteBuffer]) = {
val cachedValue = CachedValue(
value = payload,
status = if (payload.isDefined) CachedValueStatus.Found else CachedValueStatus.NotFound,
cachedAtMsec = Time.now.inMilliseconds)
val serialized = Buf.ByteArray.Owned(CachedValueThriftSerializer.toBytes(cachedValue))
(expiry(value), serialized)
}
value match {
case Throw(NotFound) =>
Some(serializeCachedValue(None))
case Return(struct) =>
val payload = Some(ByteBuffer.wrap(ValueThriftSerializer.toBytes(struct)))
Some(serializeCachedValue(payload))
case _ =>
None
}
}
/**
* Deserializes values serialized by [[serializeValue]]. The
* value will be [[CacheResult.Fresh]] or [[CacheResult.Stale]]
* depending on the result of {{{softTtl.isFresh}}}.
*
* @throws UnexpectedCachedValueState if the state of the
* [[CachedValue]] could not be produced by [[serialize]]
*/
override def deserialize(buf: Buf): CacheResult[Try[V]] = {
val cachedValue = CachedValueThriftSerializer.fromBytes(Buf.ByteArray.Owned.extract(buf))
val hasValue = cachedValue.value.isDefined
val isValid =
(hasValue && cachedValue.status == CachedValueStatus.Found) ||
(!hasValue && cachedValue.status == CachedValueStatus.NotFound)
if (!isValid) {
// Exceptions thrown by deserialization are recorded and treated
// as a cache miss by CacheOperations, so throwing this
// exception will cause the value in cache to be
// overwritten. There will be stats recorded whenever this
// happens.
throw UnexpectedCachedValueState(cachedValue)
}
val value =
cachedValue.value match {
case Some(valueBuffer) =>
val valueBytes = new Array[Byte](valueBuffer.remaining)
valueBuffer.duplicate.get(valueBytes)
Return(ValueThriftSerializer.fromBytes(valueBytes))
case None =>
Throw(NotFound)
}
softTtl.toCacheResult(value, Time.fromMilliseconds(cachedValue.cachedAtMsec))
}
}

View file

@ -0,0 +1,120 @@
package com.twitter.tweetypie.caching
import com.twitter.util.Duration
import com.twitter.util.Time
import scala.util.Random
import com.twitter.logging.Logger
/**
* Used to determine whether values successfully retrieved from cache
* are [[CacheResult.Fresh]] or [[CacheResult.Stale]]. This is useful
* in the implementation of a [[ValueSerializer]].
*/
trait SoftTtl[-V] {
/**
* Determines whether a cached value was fresh.
*
* @param cachedAt the time at which the value was cached.
*/
def isFresh(value: V, cachedAt: Time): Boolean
/**
* Wraps the value in Fresh or Stale depending on the value of `isFresh`.
*
* (The type variable U exists because it is not allowed to return
* values of a contravariant type, so we must define a variable that
* is a specific subclass of V. This is worth it because it allows
* us to create polymorphic policies without having to specify the
* type. Another solution would be to make the type invariant, but
* then we would have to specify the type whenever we create an
* instance.)
*/
def toCacheResult[U <: V](value: U, cachedAt: Time): CacheResult[U] =
if (isFresh(value, cachedAt)) CacheResult.Fresh(value) else CacheResult.Stale(value)
}
object SoftTtl {
/**
* Regardless of the inputs, the value will always be considered
* fresh.
*/
object NeverRefresh extends SoftTtl[Any] {
override def isFresh(_unusedValue: Any, _unusedCachedAt: Time): Boolean = true
}
/**
* Trigger refresh based on the length of time that a value has been
* stored in cache, ignoring the value.
*
* @param softTtl Items that were cached longer ago than this value
* will be refreshed when they are accessed.
*
* @param jitter Add nondeterminism to the soft TTL to prevent a
* thundering herd of requests refreshing the value at the same
* time. The time at which the value is considered stale will be
* uniformly spread out over a range of +/- (jitter/2). It is
* valid to set the jitter to zero, which will turn off jittering.
*
* @param logger If non-null, use this logger rather than one based
* on the class name. This logger is only used for trace-level
* logging.
*/
case class ByAge[V](
softTtl: Duration,
jitter: Duration,
specificLogger: Logger = null,
rng: Random = Random)
extends SoftTtl[Any] {
private[this] val logger: Logger =
if (specificLogger == null) Logger(getClass) else specificLogger
private[this] val maxJitterMs: Long = jitter.inMilliseconds
// this requirement is due to using Random.nextInt to choose the
// jitter, but it allows jitter of greater than 24 days
require(maxJitterMs <= (Int.MaxValue / 2))
// Negative jitter probably indicates misuse of the API
require(maxJitterMs >= 0)
// we want period +/- jitter, but the random generator
// generates non-negative numbers, so we generate [0, 2 *
// maxJitter) and subtract maxJitter to obtain [-maxJitter,
// maxJitter)
private[this] val maxJitterRangeMs: Int = (maxJitterMs * 2).toInt
// We perform all calculations in milliseconds, so convert the
// period to milliseconds out here.
private[this] val softTtlMs: Long = softTtl.inMilliseconds
// If the value is below this age, it will always be fresh,
// regardless of jitter.
private[this] val alwaysFreshAgeMs: Long = softTtlMs - maxJitterMs
// If the value is above this age, it will always be stale,
// regardless of jitter.
private[this] val alwaysStaleAgeMs: Long = softTtlMs + maxJitterMs
override def isFresh(value: Any, cachedAt: Time): Boolean = {
val ageMs: Long = (Time.now - cachedAt).inMilliseconds
val fresh =
if (ageMs <= alwaysFreshAgeMs) {
true
} else if (ageMs > alwaysStaleAgeMs) {
false
} else {
val jitterMs: Long = rng.nextInt(maxJitterRangeMs) - maxJitterMs
ageMs <= softTtlMs + jitterMs
}
logger.ifTrace(
s"Checked soft ttl: fresh = $fresh, " +
s"soft_ttl_ms = $softTtlMs, age_ms = $ageMs, value = $value")
fresh
}
}
}

View file

@ -0,0 +1,65 @@
package com.twitter.tweetypie.caching
import scala.collection.mutable
import com.twitter.util.Future
import com.twitter.stitch.Stitch
import com.twitter.stitch.Runner
import com.twitter.stitch.FutureRunner
import com.twitter.stitch.Group
/**
* Workaround for a infelicity in the implementation of [[Stitch.async]].
*
* This has the same semantics to [[Stitch.async]], with the exception
* that interrupts to the main computation will not interrupt the
* async call.
*
* The problem that this implementation solves is that we do not want
* async calls grouped together with synchronous calls. See the
* mailing list thread [1] for discussion. This may eventually be
* fixed in Stitch.
*/
private[caching] object StitchAsync {
// Contains a deferred Stitch that we want to run asynchronously
private[this] class AsyncCall(deferred: => Stitch[_]) {
def call(): Stitch[_] = deferred
}
private object AsyncGroup extends Group[AsyncCall, Unit] {
override def runner(): Runner[AsyncCall, Unit] =
new FutureRunner[AsyncCall, Unit] {
// All of the deferred calls of any type. When they are
// executed in `run`, the normal Stitch batching and deduping
// will occur.
private[this] val calls = new mutable.ArrayBuffer[AsyncCall]
def add(call: AsyncCall): Stitch[Unit] = {
// Just remember the deferred call.
calls.append(call)
// Since we don't wait for the completion of the effect,
// just return a constant value.
Stitch.Unit
}
def run(): Future[_] = {
// The future returned from this innter invocation of
// Stitch.run is not linked to the returned future, so these
// effects are not linked to the outer Run in which this
// method was invoked.
Stitch.run {
Stitch.traverse(calls) { asyncCall: AsyncCall =>
asyncCall
.call()
.liftToTry // So that an exception will not interrupt the other calls
}
}
Future.Unit
}
}
}
def apply(call: => Stitch[_]): Stitch[Unit] =
// Group together all of the async calls
Stitch.call(new AsyncCall(call), AsyncGroup)
}

View file

@ -0,0 +1,62 @@
package com.twitter.tweetypie.caching
import com.twitter.stitch.MapGroup
import com.twitter.stitch.Group
import com.twitter.stitch.Stitch
import com.twitter.util.Future
import com.twitter.util.Return
import com.twitter.util.Try
/**
* Wrapper around [[CacheOperations]] providing a [[Stitch]] API.
*/
case class StitchCacheOperations[K, V](operations: CacheOperations[K, V]) {
import StitchCacheOperations.SetCall
private[this] val getGroup: Group[K, CacheResult[V]] =
MapGroup[K, CacheResult[V]] { keys: Seq[K] =>
operations
.get(keys)
.map(values => keys.zip(values).toMap.mapValues(Return(_)))
}
def get(key: K): Stitch[CacheResult[V]] =
Stitch.call(key, getGroup)
private[this] val setGroup: Group[SetCall[K, V], Unit] =
new MapGroup[SetCall[K, V], Unit] {
override def run(calls: Seq[SetCall[K, V]]): Future[SetCall[K, V] => Try[Unit]] =
Future
.collectToTry(calls.map(call => operations.set(call.key, call.value)))
.map(tries => calls.zip(tries).toMap)
}
/**
* Performs a [[CacheOperations.set]].
*/
def set(key: K, value: V): Stitch[Unit] =
// This is implemented as a Stitch.call instead of a Stitch.future
// in order to handle the case where a batch has a duplicate
// key. Each copy of the duplicate key will trigger a write back
// to cache, so we dedupe the writes in order to avoid the
// extraneous RPC call.
Stitch.call(new StitchCacheOperations.SetCall(key, value), setGroup)
}
object StitchCacheOperations {
/**
* Used as the "call" for [[SetGroup]]. This is essentially a tuple
* where equality is defined only by the key.
*/
private class SetCall[K, V](val key: K, val value: V) {
override def equals(other: Any): Boolean =
other match {
case setCall: SetCall[_, _] => key == setCall.key
case _ => false
}
override def hashCode: Int = key.hashCode
}
}

View file

@ -0,0 +1,36 @@
package com.twitter.tweetypie.caching
import com.twitter.stitch.Stitch
/**
* Apply caching to a [[Stitch]] function.
*
* @see CacheResult for more information about the semantics
* implemented here.
*/
class StitchCaching[K, V](operations: CacheOperations[K, V], repo: K => Stitch[V])
extends (K => Stitch[V]) {
private[this] val stitchOps = new StitchCacheOperations(operations)
override def apply(key: K): Stitch[V] =
stitchOps.get(key).flatMap {
case CacheResult.Fresh(value) =>
Stitch.value(value)
case CacheResult.Stale(staleValue) =>
StitchAsync(repo(key).flatMap(refreshed => stitchOps.set(key, refreshed)))
.map(_ => staleValue)
case CacheResult.Miss =>
repo(key)
.applyEffect(value => StitchAsync(stitchOps.set(key, value)))
case CacheResult.Failure(_) =>
// In the case of failure, we don't attempt to write back to
// cache, because cache failure usually means communication
// failure, and sending more requests to the cache that holds
// the value for this key could make the situation worse.
repo(key)
}
}

View file

@ -0,0 +1,47 @@
package com.twitter.tweetypie.caching
import com.twitter.io.Buf
import com.twitter.util.Time
/**
* How to store values of type V in cache. This includes whether a
* given value is cacheable, how to serialize it, when it should
* expire from cache, and how to interpret byte patterns from cache.
*/
trait ValueSerializer[V] {
/**
* Prepare the value for storage in cache. When a [[Some]] is
* returned, the [[Buf]] should be a valid input to [[deserialize]]
* and the [[Time]] will be used as the expiry in the memcached
* command. When [[None]] is returned, it indicates that the value
* cannot or should not be written back to cache.
*
* The most common use case for returning None is caching Try
* values, where certain exceptional values encode a cacheable state
* of a value. In particular, Throw(NotFound) is commonly used to
* encode a missing value, and we usually want to cache those
* negative lookups, but we don't want to cache e.g. a timeout
* exception.
*
* @return a pair of expiry time for this cache entry and the bytes
* to store in cache. If you do not want this value to explicitly
* expire, use Time.Top as the expiry.
*/
def serialize(value: V): Option[(Time, Buf)]
/**
* Deserialize a value found in cache. This function converts the
* bytes found in memcache to a [[CacheResult]]. In general, you
* probably want to return [[CacheResult.Fresh]] or
* [[CacheResult.Stale]], but you are free to return any of the
* range of [[CacheResult]]s, depending on the behavior that you
* want.
*
* This is a total function because in the common use case, the
* bytes stored in cache will be appropriate for the
* serializer. This method is free to throw any exception if the
* bytes are not valid.
*/
def deserialize(serializedValue: Buf): CacheResult[V]
}

View file

@ -0,0 +1,15 @@
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication",
"finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/transport",
"finagle/finagle-thrift/src/main/scala",
"tweetypie/servo/util/src/main/scala:exception",
"strato/src/main/scala/com/twitter/strato/access",
"strato/src/main/scala/com/twitter/strato/data",
],
)

View file

@ -0,0 +1,185 @@
package com.twitter.tweetypie.client_id
import com.twitter.finagle.mtls.authentication.EmptyServiceIdentifier
import com.twitter.finagle.mtls.authentication.ServiceIdentifier
import com.twitter.finagle.mtls.transport.S2STransport
import com.twitter.finagle.thrift.ClientId
import com.twitter.servo.util.Gate
import com.twitter.strato.access.Access
import com.twitter.strato.access.Access.ForwardedServiceIdentifier
object ClientIdHelper {
val UnknownClientId = "unknown"
def default: ClientIdHelper = new ClientIdHelper(UseTransportServiceIdentifier)
/**
* Trims off the last .element, which is usually .prod or .staging
*/
def getClientIdRoot(clientId: String): String =
clientId.lastIndexOf('.') match {
case -1 => clientId
case idx => clientId.substring(0, idx)
}
/**
* Returns the last .element without the '.'
*/
def getClientIdEnv(clientId: String): String =
clientId.lastIndexOf('.') match {
case -1 => clientId
case idx => clientId.substring(idx + 1)
}
private[client_id] def asClientId(s: ServiceIdentifier): String = s"${s.service}.${s.environment}"
}
class ClientIdHelper(serviceIdentifierStrategy: ServiceIdentifierStrategy) {
private[client_id] val ProcessPathPrefix = "/p/"
/**
* The effective client id is used for request authorization and metrics
* attribution. For calls to Tweetypie's thrift API, the thrift ClientId
* is used and is expected in the form of "service-name.env". Federated
* Strato clients don't support configured ClientIds and instead provide
* a "process path" containing instance-specific information. So for
* calls to the federated API, we compute an effective client id from
* the ServiceIdentifier, if present, in Strato's Access principles. The
* implementation avoids computing this identifier unless necessary,
* since this method is invoked on every request.
*/
def effectiveClientId: Option[String] = {
val clientId: Option[String] = ClientId.current.map(_.name)
clientId
// Exclude process paths because they are instance-specific and aren't
// supported by tweetypie for authorization or metrics purposes.
.filterNot(_.startsWith(ProcessPathPrefix))
// Try computing a value from the ServiceId if the thrift
// ClientId is undefined or unsupported.
.orElse(serviceIdentifierStrategy.serviceIdentifier.map(ClientIdHelper.asClientId))
// Ultimately fall back to the ClientId value, even when given an
// unsupported format, so that error text and debug logs include
// the value passed by the caller.
.orElse(clientId)
}
def effectiveClientIdRoot: Option[String] = effectiveClientId.map(ClientIdHelper.getClientIdRoot)
def effectiveServiceIdentifier: Option[ServiceIdentifier] =
serviceIdentifierStrategy.serviceIdentifier
}
/** Logic how to find a [[ServiceIdentifier]] for the purpose of crafting a client ID. */
trait ServiceIdentifierStrategy {
def serviceIdentifier: Option[ServiceIdentifier]
/**
* Returns the only element of given [[Set]] or [[None]].
*
* This utility is used defensively against a set of principals collected
* from [[Access.getPrincipals]]. While the contract is that there should be at most one
* instance of each principal kind present in that set, in practice that has not been the case
* always. The safest strategy to in that case is to abandon a set completely if more than
* one principals are competing.
*/
final protected def onlyElement[T](set: Set[T]): Option[T] =
if (set.size <= 1) {
set.headOption
} else {
None
}
}
/**
* Picks [[ServiceIdentifier]] from Finagle SSL Transport, if one exists.
*
* This works for both Thrift API calls as well as StratoFed API calls. Strato's
* [[Access#getPrincipals]] collection, which would typically be consulted by StratoFed
* column logic, contains the same [[ServiceIdentifier]] derived from the Finagle SSL
* transport, so there's no need to have separate strategies for Thrift vs StratoFed
* calls.
*
* This is the default behavior of using [[ServiceIdentifier]] for computing client ID.
*/
private[client_id] class UseTransportServiceIdentifier(
// overridable for testing
getPeerServiceIdentifier: => ServiceIdentifier,
) extends ServiceIdentifierStrategy {
override def serviceIdentifier: Option[ServiceIdentifier] =
getPeerServiceIdentifier match {
case EmptyServiceIdentifier => None
case si => Some(si)
}
}
object UseTransportServiceIdentifier
extends UseTransportServiceIdentifier(S2STransport.peerServiceIdentifier)
/**
* Picks [[ForwardedServiceIdentifier]] from Strato principals for client ID
* if [[ServiceIdentifier]] points at call coming from Strato.
* If not present, falls back to [[UseTransportServiceIdentifier]] behavior.
*
* Tweetypie utilizes the strategy to pick [[ServiceIdentifier]] for the purpose
* of generating a client ID when the client ID is absent or unknown.
* [[PreferForwardedServiceIdentifierForStrato]] looks for the [[ForwardedServiceIdentifier]]
* values set by stratoserver request.
* The reason is, stratoserver is effectively a conduit, forwarding the [[ServiceIdentifier]]
* of the _actual client_ that is calling stratoserver.
* Any direct callers not going through stratoserver will default to [[ServiceIdentfier]].
*/
private[client_id] class PreferForwardedServiceIdentifierForStrato(
// overridable for testing
getPeerServiceIdentifier: => ServiceIdentifier,
) extends ServiceIdentifierStrategy {
val useTransportServiceIdentifier =
new UseTransportServiceIdentifier(getPeerServiceIdentifier)
override def serviceIdentifier: Option[ServiceIdentifier] =
useTransportServiceIdentifier.serviceIdentifier match {
case Some(serviceIdentifier) if isStrato(serviceIdentifier) =>
onlyElement(
Access.getPrincipals
.collect {
case forwarded: ForwardedServiceIdentifier =>
forwarded.serviceIdentifier.serviceIdentifier
}
).orElse(useTransportServiceIdentifier.serviceIdentifier)
case other => other
}
/**
* Strato uses various service names like "stratoserver" and "stratoserver-patient".
* They all do start with "stratoserver" though, so at the point of implementing,
* the safest bet to recognize strato is to look for this prefix.
*
* This also works for staged strato instances (which it should), despite allowing
* for technically any caller to force this strategy, by creating service certificate
* with this service name.
*/
private def isStrato(serviceIdentifier: ServiceIdentifier): Boolean =
serviceIdentifier.service.startsWith("stratoserver")
}
object PreferForwardedServiceIdentifierForStrato
extends PreferForwardedServiceIdentifierForStrato(S2STransport.peerServiceIdentifier)
/**
* [[ServiceIdentifierStrategy]] which dispatches between two delegates based on the value
* of a unitary decider every time [[serviceIdentifier]] is called.
*/
class ConditionalServiceIdentifierStrategy(
private val condition: Gate[Unit],
private val ifTrue: ServiceIdentifierStrategy,
private val ifFalse: ServiceIdentifierStrategy)
extends ServiceIdentifierStrategy {
override def serviceIdentifier: Option[ServiceIdentifier] =
if (condition()) {
ifTrue.serviceIdentifier
} else {
ifFalse.serviceIdentifier
}
}

View file

@ -0,0 +1,19 @@
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
provides = scala_artifact(
org = "com.twitter.tweetypie",
name = "context",
repo = artifactory,
),
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"finagle/finagle-core/src/main",
"graphql/common/src/main/scala/com/twitter/graphql/common/core",
"src/thrift/com/twitter/context:twitter-context-scala",
"twitter-context/src/main/scala",
"util/util-core:scala",
],
)

View file

@ -0,0 +1,135 @@
package com.twitter.tweetypie.context
import com.twitter.context.TwitterContext
import com.twitter.finagle.Filter
import com.twitter.finagle.Service
import com.twitter.finagle.SimpleFilter
import com.twitter.finagle.context.Contexts
import com.twitter.io.Buf
import com.twitter.io.Buf.ByteArray.Owned
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.graphql.common.core.GraphQlClientApplication
import com.twitter.util.Try
import java.nio.charset.StandardCharsets.UTF_8
import scala.util.matching.Regex
/**
* Context and filters to help track callers of Tweetypie's endpoints. This context and its
* filters were originally added to provide visibility into callers of Tweetypie who are
* using the birdherd library to access tweets.
*
* This context data is intended to be marshalled by callers to Tweetypie, but then the
* context data is stripped (moved from broadcast to local). This happens so that the
* context data is not forwarded down tweetypie's backend rpc chains, which often result
* in transitive calls back into tweetypie. This effectively creates single-hop marshalling.
*/
object TweetypieContext {
// Bring Tweetypie permitted TwitterContext into scope
val TwitterContext: TwitterContext =
com.twitter.context.TwitterContext(com.twitter.tweetypie.TwitterContextPermit)
case class Ctx(via: String)
val Empty = Ctx("")
object Broadcast {
private[this] object Key extends Contexts.broadcast.Key[Ctx](id = Ctx.getClass.getName) {
override def marshal(value: Ctx): Buf =
Owned(value.via.getBytes(UTF_8))
override def tryUnmarshal(buf: Buf): Try[Ctx] =
Try(Ctx(new String(Owned.extract(buf), UTF_8)))
}
private[TweetypieContext] def current(): Option[Ctx] =
Contexts.broadcast.get(Key)
def currentOrElse(default: Ctx): Ctx =
current().getOrElse(default)
def letClear[T](f: => T): T =
Contexts.broadcast.letClear(Key)(f)
def let[T](ctx: Ctx)(f: => T): T =
if (Empty == ctx) {
letClear(f)
} else {
Contexts.broadcast.let(Key, ctx)(f)
}
// ctx has to be by name so we can re-evaluate it for every request (for usage in ServiceTwitter.scala)
def filter(ctx: => Ctx): Filter.TypeAgnostic =
new Filter.TypeAgnostic {
override def toFilter[Req, Rep]: Filter[Req, Rep, Req, Rep] =
(request: Req, service: Service[Req, Rep]) => Broadcast.let(ctx)(service(request))
}
}
object Local {
private[this] val Key =
new Contexts.local.Key[Ctx]
private[TweetypieContext] def let[T](ctx: Option[Ctx])(f: => T): T =
ctx match {
case Some(ctx) if ctx != Empty => Contexts.local.let(Key, ctx)(f)
case None => Contexts.local.letClear(Key)(f)
}
def current(): Option[Ctx] =
Contexts.local.get(Key)
def filter[Req, Rep]: SimpleFilter[Req, Rep] =
(request: Req, service: Service[Req, Rep]) => {
val ctx = Broadcast.current()
Broadcast.letClear(Local.let(ctx)(service(request)))
}
private[this] def clientAppIdToName(clientAppId: Long) =
GraphQlClientApplication.AllById.get(clientAppId).map(_.name).getOrElse("nonTOO")
private[this] val pathRegexes: Seq[(Regex, String)] = Seq(
("timeline_conversation_.*_json".r, "timeline_conversation__slug__json"),
("user_timeline_.*_json".r, "user_timeline__user__json"),
("[0-9]{2,}".r, "_id_")
)
// `context.via` will either be a string like: "birdherd" or "birdherd:/1.1/statuses/show/123.json,
// depending on whether birdherd code was able to determine the path of the request.
private[this] def getViaAndPath(via: String): (String, Option[String]) =
via.split(":", 2) match {
case Array(via, path) =>
val sanitizedPath = path
.replace('/', '_')
.replace('.', '_')
// Apply each regex in turn
val normalizedPath = pathRegexes.foldLeft(sanitizedPath) {
case (path, (regex, replacement)) => regex.replaceAllIn(path, replacement)
}
(via, Some(normalizedPath))
case Array(via) => (via, None)
}
def trackStats[U](scopes: StatsReceiver*): Unit =
for {
tweetypieCtx <- TweetypieContext.Local.current()
(via, pathOpt) = getViaAndPath(tweetypieCtx.via)
twitterCtx <- TwitterContext()
clientAppId <- twitterCtx.clientApplicationId
} yield {
val clientAppName = clientAppIdToName(clientAppId)
scopes.foreach { stats =>
val ctxStats = stats.scope("context")
val viaStats = ctxStats.scope("via", via)
viaStats.scope("all").counter("requests").incr()
val viaClientStats = viaStats.scope("by_client", clientAppName)
viaClientStats.counter("requests").incr()
pathOpt.foreach { path =>
val viaPathStats = viaStats.scope("by_path", path)
viaPathStats.counter("requests").incr()
}
}
}
}
}

View file

@ -0,0 +1,15 @@
scala_library(
sources = ["DeciderGates.scala"],
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/guava",
"decider",
"finagle/finagle-toggle/src/main/scala/com/twitter/finagle/server",
"tweetypie/servo/decider",
"tweetypie/servo/util/src/main/scala",
"util/util-core:scala",
],
)

View file

@ -0,0 +1,60 @@
package com.twitter.tweetypie
package decider
import com.google.common.hash.Hashing
import com.twitter.decider.Decider
import com.twitter.decider.Feature
import com.twitter.servo.gate.DeciderGate
import com.twitter.servo.util.Gate
import java.nio.charset.StandardCharsets
import scala.collection.mutable
trait DeciderGates {
def overrides: Map[String, Boolean] = Map.empty
def decider: Decider
def prefix: String
protected val seenFeatures: mutable.HashSet[String] = new mutable.HashSet[String]
private def deciderFeature(name: String): Feature = {
decider.feature(prefix + "_" + name)
}
def withOverride[T](name: String, mkGate: Feature => Gate[T]): Gate[T] = {
seenFeatures += name
overrides.get(name).map(Gate.const).getOrElse(mkGate(deciderFeature(name)))
}
protected def linear(name: String): Gate[Unit] = withOverride[Unit](name, DeciderGate.linear)
protected def byId(name: String): Gate[Long] = withOverride[Long](name, DeciderGate.byId)
/**
* It returns a Gate[String] that can be used to check availability of the feature.
* The string is hashed into a Long and used as an "id" and then used to call servo's
* DeciderGate.byId
*
* @param name decider name
* @return Gate[String]
*/
protected def byStringId(name: String): Gate[String] =
byId(name).contramap { s: String =>
Hashing.sipHash24().hashString(s, StandardCharsets.UTF_8).asLong()
}
def all: Traversable[String] = seenFeatures
def unusedOverrides: Set[String] = overrides.keySet.diff(all.toSet)
/**
* Generate a map of name -> availability, taking into account overrides.
* Overrides are either on or off so map to 10000 or 0, respectively.
*/
def availabilityMap: Map[String, Option[Int]] =
all.map { name =>
val availability: Option[Int] = overrides
.get(name)
.map(on => if (on) 10000 else 0)
.orElse(deciderFeature(name).availability)
name -> availability
}.toMap
}

View file

@ -0,0 +1,10 @@
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"decider",
],
)

View file

@ -0,0 +1,42 @@
package com.twitter.tweetypie.decider.overrides
import com.twitter.decider.LocalOverrides
object TweetyPieDeciderOverrides extends LocalOverrides.Namespace("tweetypie", "tweetypie_") {
val CheckSpamOnRetweet: LocalOverrides.Override = feature("check_spam_on_retweet")
val CheckSpamOnTweet: LocalOverrides.Override = feature("check_spam_on_tweet")
val ConversationControlUseFeatureSwitchResults: LocalOverrides.Override = feature(
"conversation_control_use_feature_switch_results")
val ConversationControlTweetCreateEnabled: LocalOverrides.Override = feature(
"conversation_control_tweet_create_enabled")
val EnableExclusiveTweetControlValidation: LocalOverrides.Override = feature(
"enable_exclusive_tweet_control_validation")
val EnableHotKeyCaches: LocalOverrides.Override = feature("enable_hot_key_caches")
val HydrateConversationMuted: LocalOverrides.Override = feature("hydrate_conversation_muted")
val HydrateExtensionsOnWrite: LocalOverrides.Override = feature("hydrate_extensions_on_write")
val HydrateEscherbirdAnnotations: LocalOverrides.Override = feature(
"hydrate_escherbird_annotations")
val HydrateGnipProfileGeoEnrichment: LocalOverrides.Override = feature(
"hydrate_gnip_profile_geo_enrichment")
val HydratePastedPics: LocalOverrides.Override = feature("hydrate_pasted_pics")
val HydratePerspectivesEditsForOtherSafetyLevels: LocalOverrides.Override = feature(
"hydrate_perspectives_edits_for_other_levels")
val HydrateScrubEngagements: LocalOverrides.Override = feature("hydrate_scrub_engagements")
val LogRepoExceptions: LocalOverrides.Override = feature("log_repo_exceptions")
val MediaRefsHydratorIncludePastedMedia: LocalOverrides.Override = feature(
"media_refs_hydrator_include_pasted_media")
val ShortCircuitLikelyPartialTweetReads: LocalOverrides.Override = feature(
"short_circuit_likely_partial_tweet_reads_ms")
val RateLimitByLimiterService: LocalOverrides.Override = feature("rate_limit_by_limiter_service")
val RateLimitTweetCreationFailure: LocalOverrides.Override = feature(
"rate_limit_tweet_creation_failure")
val ReplyTweetConversationControlHydrationEnabled = feature(
"reply_tweet_conversation_control_hydration_enabled"
)
val DisableInviteViaMention = feature(
"disable_invite_via_mention"
)
val EnableRemoveUnmentionedImplicitMentions: LocalOverrides.Override = feature(
"enable_remove_unmentioned_implicit_mentions")
val useReplicatedDeleteTweet2: LocalOverrides.Override = feature("use_replicated_delete_tweet_2")
}

View file

@ -0,0 +1,15 @@
scala_library(
compiler_option_sets = ["fatal_warnings"],
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"finagle/finagle-core/src/main",
"incentives/jiminy/src/main/thrift/com/twitter/incentives/jiminy:thrift-scala",
"tweetypie/servo/util/src/main/scala",
"stitch/stitch-core",
"strato/src/main/scala/com/twitter/strato/client",
"tweetypie/server/src/main/scala/com/twitter/tweetypie/core",
"util/util-core",
"util/util-stats",
],
)

View file

@ -0,0 +1,165 @@
package com.twitter.tweetypie.jiminy.tweetypie
import com.twitter.finagle.stats.CategorizingExceptionStatsHandler
import com.twitter.finagle.stats.Stat
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.incentives.jiminy.thriftscala._
import com.twitter.servo.util.FutureArrow
import com.twitter.servo.util.Gate
import com.twitter.stitch.Stitch
import com.twitter.strato.thrift.ScroogeConvImplicits._
import com.twitter.strato.client.{Client => StratoClient}
import com.twitter.tweetypie.core.TweetCreateFailure
import com.twitter.util.Future
import com.twitter.util.Return
import com.twitter.util.Throw
case class NudgeBuilderRequest(
text: String,
inReplyToTweetId: Option[NudgeBuilder.TweetId],
conversationId: Option[NudgeBuilder.TweetId],
hasQuotedTweet: Boolean,
nudgeOptions: Option[CreateTweetNudgeOptions],
tweetId: Option[NudgeBuilder.TweetId])
trait NudgeBuilder extends FutureArrow[NudgeBuilderRequest, Unit] {
/**
* Check whether the user should receive a nudge instead of creating
* the Tweet. If nudgeOptions is None, then no nudge check will be
* performed.
*
* @return a Future.exception containing a [[TweetCreateFailure]] if the
* user should be nudged, or Future.Unit if the user should not be
* nudged.
*/
def apply(
request: NudgeBuilderRequest
): Future[Unit]
}
object NudgeBuilder {
type Type = FutureArrow[NudgeBuilderRequest, Unit]
type TweetId = Long
// darkTrafficCreateNudgeOptions ensure that our dark traffic sends a request that will
// accurately test the Jiminy backend. in this case, we specify that we want checks for all
// possible nudge types
private[this] val darkTrafficCreateNudgeOptions = Some(
CreateTweetNudgeOptions(
requestedNudgeTypes = Some(
Set(
TweetNudgeType.PotentiallyToxicTweet,
TweetNudgeType.ReviseOrMute,
TweetNudgeType.ReviseOrHideThenBlock,
TweetNudgeType.ReviseOrBlock
)
)
)
)
private[this] def mkJiminyRequest(
request: NudgeBuilderRequest,
isDarkRequest: Boolean = false
): CreateTweetNudgeRequest = {
val tweetType =
if (request.inReplyToTweetId.nonEmpty) TweetType.Reply
else if (request.hasQuotedTweet) TweetType.QuoteTweet
else TweetType.OriginalTweet
CreateTweetNudgeRequest(
tweetText = request.text,
tweetType = tweetType,
inReplyToTweetId = request.inReplyToTweetId,
conversationId = request.conversationId,
createTweetNudgeOptions =
if (isDarkRequest) darkTrafficCreateNudgeOptions else request.nudgeOptions,
tweetId = request.tweetId
)
}
/**
* NudgeBuilder implemented by calling the strato column `incentives/createNudge`.
*
* Stats recorded:
* - latency_ms: Latency histogram (also implicitly number of
* invocations). This is counted only in the case that a nudge
* check was requested (`nudgeOptions` is non-empty)
*
* - nudge: The nudge check succeeded and a nudge was created.
*
* - no_nudge: The nudge check succeeded, but no nudge was created.
*
* - failures: Calling strato to create a nudge failed. Broken out
* by exception.
*/
def apply(
nudgeArrow: FutureArrow[CreateTweetNudgeRequest, CreateTweetNudgeResponse],
enableDarkTraffic: Gate[Unit],
stats: StatsReceiver
): NudgeBuilder = {
new NudgeBuilder {
private[this] val nudgeLatencyStat = stats.stat("latency_ms")
private[this] val nudgeCounter = stats.counter("nudge")
private[this] val noNudgeCounter = stats.counter("no_nudge")
private[this] val darkRequestCounter = stats.counter("dark_request")
private[this] val nudgeExceptionHandler = new CategorizingExceptionStatsHandler
override def apply(
request: NudgeBuilderRequest
): Future[Unit] =
request.nudgeOptions match {
case None =>
if (enableDarkTraffic()) {
darkRequestCounter.incr()
Stat
.timeFuture(nudgeLatencyStat) {
nudgeArrow(mkJiminyRequest(request, isDarkRequest = true))
}
.transform { _ =>
// ignore the response since it is a dark request
Future.Done
}
} else {
Future.Done
}
case Some(_) =>
Stat
.timeFuture(nudgeLatencyStat) {
nudgeArrow(mkJiminyRequest(request))
}
.transform {
case Throw(e) =>
nudgeExceptionHandler.record(stats, e)
// If we failed to invoke the nudge column, then
// just continue on with the Tweet creation.
Future.Done
case Return(CreateTweetNudgeResponse(Some(nudge))) =>
nudgeCounter.incr()
Future.exception(TweetCreateFailure.Nudged(nudge = nudge))
case Return(CreateTweetNudgeResponse(None)) =>
noNudgeCounter.incr()
Future.Done
}
}
}
}
def apply(
strato: StratoClient,
enableDarkTraffic: Gate[Unit],
stats: StatsReceiver
): NudgeBuilder = {
val executer =
strato.executer[CreateTweetNudgeRequest, CreateTweetNudgeResponse](
"incentives/createTweetNudge")
val nudgeArrow: FutureArrow[CreateTweetNudgeRequest, CreateTweetNudgeResponse] = { req =>
Stitch.run(executer.execute(req))
}
apply(nudgeArrow, enableDarkTraffic, stats)
}
}

View file

@ -0,0 +1,18 @@
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"src/java/com/twitter/common/text/language:language-identifier",
"src/java/com/twitter/common/text/language:locale-util",
"src/java/com/twitter/common/text/pipeline",
"src/java/com/twitter/common/text/token",
"src/java/com/twitter/common_internal/text",
"src/java/com/twitter/common_internal/text/version",
"tweetypie/src/resources/com/twitter/tweetypie/matching",
"util/util-core/src/main/scala/com/twitter/concurrent",
"util/util-core/src/main/scala/com/twitter/io",
],
)

View file

@ -0,0 +1,92 @@
package com.twitter.tweetypie.matching
object TokenSequence {
/**
* Is `suffix` a suffix of `s`, starting at `offset` in `s`?
*/
def hasSuffixAt(s: CharSequence, suffix: CharSequence, offset: Int): Boolean =
if (offset == 0 && (s.eq(suffix) || s == suffix)) {
true
} else if (suffix.length != (s.length - offset)) {
false
} else {
@annotation.tailrec
def go(i: Int): Boolean =
if (i == suffix.length) true
else if (suffix.charAt(i) == s.charAt(offset + i)) go(i + 1)
else false
go(0)
}
/**
* Do two [[CharSequence]]s contain the same characters?
*
* [[CharSequence]] equality is not sufficient because
* [[CharSequence]]s of different types may not consider other
* [[CharSequence]]s containing the same characters equivalent.
*/
def sameCharacters(s1: CharSequence, s2: CharSequence): Boolean =
hasSuffixAt(s1, s2, 0)
/**
* This method implements the product definition of a token matching a
* keyword. That definition is:
*
* - The token contains the same characters as the keyword.
* - The token contains the same characters as the keyword after
* dropping a leading '#' or '@' from the token.
*
* The intention is that a keyword matches an identical hashtag, but
* if the keyword itself is a hashtag, it only matches the hashtag
* form.
*
* The tokenization process should rule out tokens or keywords that
* start with multiple '#' characters, even though this implementation
* allows for e.g. token "##a" to match "#a".
*/
def tokenMatches(token: CharSequence, keyword: CharSequence): Boolean =
if (sameCharacters(token, keyword)) true
else if (token.length == 0) false
else {
val tokenStart = token.charAt(0)
(tokenStart == '#' || tokenStart == '@') && hasSuffixAt(token, keyword, 1)
}
}
/**
* A sequence of normalized tokens. The sequence depends on the locale
* in which the text was parsed and the version of the penguin library
* that was used at tokenization time.
*/
case class TokenSequence private[matching] (toIndexedSeq: IndexedSeq[CharSequence]) {
import TokenSequence.tokenMatches
private def apply(i: Int): CharSequence = toIndexedSeq(i)
def isEmpty: Boolean = toIndexedSeq.isEmpty
def nonEmpty: Boolean = toIndexedSeq.nonEmpty
/**
* Does the supplied sequence of keywords match a consecutive sequence
* of tokens within this sequence?
*/
def containsKeywordSequence(keywords: TokenSequence): Boolean = {
val finalIndex = toIndexedSeq.length - keywords.toIndexedSeq.length
@annotation.tailrec
def matchesAt(offset: Int, i: Int): Boolean =
if (i >= keywords.toIndexedSeq.length) true
else if (tokenMatches(this(i + offset), keywords(i))) matchesAt(offset, i + 1)
else false
@annotation.tailrec
def search(offset: Int): Boolean =
if (offset > finalIndex) false
else if (matchesAt(offset, 0)) true
else search(offset + 1)
search(0)
}
}

View file

@ -0,0 +1,156 @@
package com.twitter.tweetypie.matching
import com.twitter.common.text.language.LocaleUtil
import com.twitter.common_internal.text.pipeline.TwitterTextNormalizer
import com.twitter.common_internal.text.pipeline.TwitterTextTokenizer
import com.twitter.common_internal.text.version.PenguinVersion
import com.twitter.concurrent.Once
import com.twitter.io.StreamIO
import java.util.Locale
import scala.collection.JavaConverters._
/**
* Extract a sequence of normalized tokens from the input text. The
* normalization and tokenization are properly configured for keyword
* matching between texts.
*/
trait Tokenizer {
def tokenize(input: String): TokenSequence
}
object Tokenizer {
/**
* When a Penguin version is not explicitly specified, use this
* version of Penguin to perform normalization and tokenization. If
* you cache tokenized text, be sure to store the version as well, to
* avoid comparing text that was normalized with different algorithms.
*/
val DefaultPenguinVersion: PenguinVersion = PenguinVersion.PENGUIN_6
/**
* If you already know the locale of the text that is being tokenized,
* use this method to get a tokenizer that is much more efficient than
* the Tweet or Query tokenizer, since it does not have to perform
* language detection.
*/
def forLocale(locale: Locale): Tokenizer = get(locale, DefaultPenguinVersion)
/**
* Obtain a `Tokenizer` that will tokenize the text for the given
* locale and version of the Penguin library.
*/
def get(locale: Locale, version: PenguinVersion): Tokenizer =
TokenizerFactories(version).forLocale(locale)
/**
* Encapsulates the configuration and use of [[TwitterTextTokenizer]]
* and [[TwitterTextNormalizer]].
*/
private[this] class TokenizerFactory(version: PenguinVersion) {
// The normalizer is thread-safe, so share one instance.
private[this] val normalizer =
(new TwitterTextNormalizer.Builder(version)).build()
// The TwitterTextTokenizer is relatively expensive to build,
// and is not thread safe, so keep instances of it in a
// ThreadLocal.
private[this] val local =
new ThreadLocal[TwitterTextTokenizer] {
override def initialValue: TwitterTextTokenizer =
(new TwitterTextTokenizer.Builder(version)).build()
}
/**
* Obtain a [[Tokenizer]] for this combination of [[PenguinVersion]]
* and [[Locale]].
*/
def forLocale(locale: Locale): Tokenizer =
new Tokenizer {
override def tokenize(input: String): TokenSequence = {
val stream = local.get.getTwitterTokenStreamFor(locale)
stream.reset(normalizer.normalize(input, locale))
val builder = IndexedSeq.newBuilder[CharSequence]
while (stream.incrementToken) builder += stream.term()
TokenSequence(builder.result())
}
}
}
/**
* Since there are a small number of Penguin versions, eagerly
* initialize a TokenizerFactory for each version, to avoid managing
* mutable state.
*/
private[this] val TokenizerFactories: PenguinVersion => TokenizerFactory =
PenguinVersion.values.map(v => v -> new TokenizerFactory(v)).toMap
/**
* The set of locales used in warmup. These locales are mentioned in
* the logic of TwitterTextTokenizer and TwitterTextNormalizer.
*/
private[this] val WarmUpLocales: Seq[Locale] =
Seq
.concat(
Seq(
Locale.JAPANESE,
Locale.KOREAN,
LocaleUtil.UNKNOWN,
LocaleUtil.THAI,
LocaleUtil.ARABIC,
LocaleUtil.SWEDISH
),
LocaleUtil.CHINESE_JAPANESE_LOCALES.asScala,
LocaleUtil.CJK_LOCALES.asScala
)
.toSet
.toArray
.toSeq
/**
* Load the default inputs that are used for warming up this library.
*/
def warmUpCorpus(): Seq[String] = {
val stream = getClass.getResourceAsStream("warmup-text.txt")
val bytes =
try StreamIO.buffer(stream)
finally stream.close()
bytes.toString("UTF-8").linesIterator.toArray.toSeq
}
/**
* Exercise the functionality of this library on the specified
* strings. In general, prefer [[warmUp]] to this method.
*/
def warmUpWith(ver: PenguinVersion, texts: Iterable[String]): Unit =
texts.foreach { txt =>
// Exercise each locale
WarmUpLocales.foreach { loc =>
Tokenizer.get(loc, ver).tokenize(txt)
UserMutes.builder().withPenguinVersion(ver).withLocale(loc).validate(txt)
}
// Exercise language detection
TweetTokenizer.get(ver).tokenize(txt)
UserMutes.builder().withPenguinVersion(ver).validate(txt)
}
private[this] val warmUpOnce = Once(warmUpWith(DefaultPenguinVersion, warmUpCorpus()))
/**
* The creation of the first TwitterTextTokenizer is relatively
* expensive, and tokenizing some texts may cause significant
* initialization.
*
* This method exercises the functionality of this library
* with a range of texts in order to perform as much initialization as
* possible before the library is used in a latency-sensitive way.
*
* The warmup routine will only run once. Subsequent invocations of
* `warmUp` will no do additional work, and will return once warmup is
* complete.
*
* The warmup will take on the order of seconds.
*/
def warmUp(): Unit = warmUpOnce()
}

View file

@ -0,0 +1,45 @@
package com.twitter.tweetypie.matching
import com.twitter.common.text.pipeline.TwitterLanguageIdentifier
import com.twitter.common_internal.text.version.PenguinVersion
import java.util.Locale
object TweetTokenizer extends Tokenizer {
type LocalePicking = Option[Locale] => Tokenizer
/**
* Get a Tokenizer-producing function that uses the supplied locale
* to select an appropriate Tokenizer.
*/
def localePicking: LocalePicking = {
case None => TweetTokenizer
case Some(locale) => Tokenizer.forLocale(locale)
}
private[this] val tweetLangIdentifier =
(new TwitterLanguageIdentifier.Builder).buildForTweet()
/**
* Get a Tokenizer that performs Tweet language detection, and uses
* that result to tokenize the text. If you already know the locale of
* the tweet text, use `Tokenizer.get`, because it's much
* cheaper.
*/
def get(version: PenguinVersion): Tokenizer =
new Tokenizer {
override def tokenize(text: String): TokenSequence = {
val locale = tweetLangIdentifier.identify(text).getLocale
Tokenizer.get(locale, version).tokenize(text)
}
}
private[this] val Default = get(Tokenizer.DefaultPenguinVersion)
/**
* Tokenize the given text using Tweet language detection and
* `Tokenizer.DefaultPenguinVersion`. Prefer `Tokenizer.forLocale` if
* you already know the language of the text.
*/
override def tokenize(tweetText: String): TokenSequence =
Default.tokenize(tweetText)
}

View file

@ -0,0 +1,128 @@
package com.twitter.tweetypie.matching
import com.twitter.common.text.pipeline.TwitterLanguageIdentifier
import com.twitter.common_internal.text.version.PenguinVersion
import java.util.Locale
import scala.collection.JavaConversions.asScalaBuffer
object UserMutesBuilder {
private[matching] val Default =
new UserMutesBuilder(Tokenizer.DefaultPenguinVersion, None)
private val queryLangIdentifier =
(new TwitterLanguageIdentifier.Builder).buildForQuery()
}
class UserMutesBuilder private (penguinVersion: PenguinVersion, localeOpt: Option[Locale]) {
/**
* Use the specified Penguin version when tokenizing a keyword mute
* string. In general, use the default version, unless you need to
* specify a particular version for compatibility with another system
* that is using that version.
*/
def withPenguinVersion(ver: PenguinVersion): UserMutesBuilder =
if (ver == penguinVersion) this
else new UserMutesBuilder(ver, localeOpt)
/**
* Use the specified locale when tokenizing a keyword mute string.
*/
def withLocale(locale: Locale): UserMutesBuilder =
if (localeOpt.contains(locale)) this
else new UserMutesBuilder(penguinVersion, Some(locale))
/**
* When tokenizing a user mute list, detect the language of the
* text. This is significantly more expensive than using a predefined
* locale, but is appropriate when the locale is not yet known.
*/
def detectLocale(): UserMutesBuilder =
if (localeOpt.isEmpty) this
else new UserMutesBuilder(penguinVersion, localeOpt)
private[this] lazy val tokenizer =
localeOpt match {
case None =>
// No locale was specified, so use a Tokenizer that performs
// language detection before tokenizing.
new Tokenizer {
override def tokenize(text: String): TokenSequence = {
val locale = UserMutesBuilder.queryLangIdentifier.identify(text).getLocale
Tokenizer.get(locale, penguinVersion).tokenize(text)
}
}
case Some(locale) =>
Tokenizer.get(locale, penguinVersion)
}
/**
* Given a list of the user's raw keyword mutes, return a preprocessed
* set of mutes suitable for matching against tweet text. If the input
* contains any phrases that fail validation, then they will be
* dropped.
*/
def build(rawInput: Seq[String]): UserMutes =
UserMutes(rawInput.flatMap(validate(_).right.toOption))
/**
* Java-friendly API for processing a user's list of raw keyword mutes
* into a preprocessed form suitable for matching against text.
*/
def fromJavaList(rawInput: java.util.List[String]): UserMutes =
build(asScalaBuffer(rawInput).toSeq)
/**
* Validate the raw user input muted phrase. Currently, the only
* inputs that are not valid for keyword muting are those inputs that
* do not contain any keywords, because those inputs would match all
* tweets.
*/
def validate(mutedPhrase: String): Either[UserMutes.ValidationError, TokenSequence] = {
val keywords = tokenizer.tokenize(mutedPhrase)
if (keywords.isEmpty) UserMutes.EmptyPhraseError else Right(keywords)
}
}
object UserMutes {
sealed trait ValidationError
/**
* The phrase's tokenization did not produce any tokens
*/
case object EmptyPhrase extends ValidationError
private[matching] val EmptyPhraseError = Left(EmptyPhrase)
/**
* Get a [[UserMutesBuilder]] that uses the default Penguin version and
* performs language identification to choose a locale.
*/
def builder(): UserMutesBuilder = UserMutesBuilder.Default
}
/**
* A user's muted keyword list, preprocessed into token sequences.
*/
case class UserMutes private[matching] (toSeq: Seq[TokenSequence]) {
/**
* Do any of the users' muted keyword sequences occur within the
* supplied text?
*/
def matches(text: TokenSequence): Boolean =
toSeq.exists(text.containsKeywordSequence)
/**
* Find all positions of matching muted keyword from the user's
* muted keyword list
*/
def find(text: TokenSequence): Seq[Int] =
toSeq.zipWithIndex.collect {
case (token, index) if text.containsKeywordSequence(token) => index
}
def isEmpty: Boolean = toSeq.isEmpty
def nonEmpty: Boolean = toSeq.nonEmpty
}

View file

@ -0,0 +1,17 @@
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"mediaservices/commons/src/main/thrift:thrift-scala",
"scrooge/scrooge-core/src/main/scala",
"tweetypie/servo/util/src/main/scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:media-entity-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
"tco-util",
"tweetypie/common/src/scala/com/twitter/tweetypie/util",
"util/util-logging/src/main/scala/com/twitter/logging",
],
)

View file

@ -0,0 +1,149 @@
package com.twitter.tweetypie
package media
import com.twitter.mediaservices.commons.thriftscala.MediaCategory
import com.twitter.mediaservices.commons.tweetmedia.thriftscala._
import com.twitter.tco_util.TcoSlug
import com.twitter.tweetypie.thriftscala._
import com.twitter.tweetypie.util.TweetLenses
/**
* A smörgåsbord of media-related helper methods.
*/
object Media {
val AnimatedGifContentType = "video/mp4 codecs=avc1.42E0"
case class MediaTco(expandedUrl: String, url: String, displayUrl: String)
val ImageContentTypes: Set[MediaContentType] =
Set[MediaContentType](
MediaContentType.ImageJpeg,
MediaContentType.ImagePng,
MediaContentType.ImageGif
)
val AnimatedGifContentTypes: Set[MediaContentType] =
Set[MediaContentType](
MediaContentType.VideoMp4
)
val VideoContentTypes: Set[MediaContentType] =
Set[MediaContentType](
MediaContentType.VideoGeneric
)
val InUseContentTypes: Set[MediaContentType] =
Set[MediaContentType](
MediaContentType.ImageGif,
MediaContentType.ImageJpeg,
MediaContentType.ImagePng,
MediaContentType.VideoMp4,
MediaContentType.VideoGeneric
)
def isImage(contentType: MediaContentType): Boolean =
ImageContentTypes.contains(contentType)
def contentTypeToString(contentType: MediaContentType): String =
contentType match {
case MediaContentType.ImageGif => "image/gif"
case MediaContentType.ImageJpeg => "image/jpeg"
case MediaContentType.ImagePng => "image/png"
case MediaContentType.VideoMp4 => "video/mp4"
case MediaContentType.VideoGeneric => "video"
case _ => throw new IllegalArgumentException(s"UnknownMediaContentType: $contentType")
}
def stringToContentType(str: String): MediaContentType =
str match {
case "image/gif" => MediaContentType.ImageGif
case "image/jpeg" => MediaContentType.ImageJpeg
case "image/png" => MediaContentType.ImagePng
case "video/mp4" => MediaContentType.VideoMp4
case "video" => MediaContentType.VideoGeneric
case _ => throw new IllegalArgumentException(s"Unknown Content Type String: $str")
}
def extensionForContentType(cType: MediaContentType): String =
cType match {
case MediaContentType.ImageJpeg => "jpg"
case MediaContentType.ImagePng => "png"
case MediaContentType.ImageGif => "gif"
case MediaContentType.VideoMp4 => "mp4"
case MediaContentType.VideoGeneric => ""
case _ => "unknown"
}
/**
* Extract a URL entity from a media entity.
*/
def extractUrlEntity(mediaEntity: MediaEntity): UrlEntity =
UrlEntity(
fromIndex = mediaEntity.fromIndex,
toIndex = mediaEntity.toIndex,
url = mediaEntity.url,
expanded = Some(mediaEntity.expandedUrl),
display = Some(mediaEntity.displayUrl)
)
/**
* Copy the fields from the URL entity into the media entity.
*/
def copyFromUrlEntity(mediaEntity: MediaEntity, urlEntity: UrlEntity): MediaEntity = {
val expandedUrl =
urlEntity.expanded.orElse(Option(mediaEntity.expandedUrl)).getOrElse(urlEntity.url)
val displayUrl =
urlEntity.url match {
case TcoSlug(slug) => MediaUrl.Display.fromTcoSlug(slug)
case _ => urlEntity.expanded.getOrElse(urlEntity.url)
}
mediaEntity.copy(
fromIndex = urlEntity.fromIndex,
toIndex = urlEntity.toIndex,
url = urlEntity.url,
expandedUrl = expandedUrl,
displayUrl = displayUrl
)
}
def getAspectRatio(size: MediaSize): AspectRatio =
getAspectRatio(size.width, size.height)
def getAspectRatio(width: Int, height: Int): AspectRatio = {
if (width == 0 || height == 0) {
throw new IllegalArgumentException(s"Dimensions must be non zero: ($width, $height)")
}
def calculateGcd(a: Int, b: Int): Int =
if (b == 0) a else calculateGcd(b, a % b)
val gcd = calculateGcd(math.max(width, height), math.min(width, height))
AspectRatio((width / gcd).toShort, (height / gcd).toShort)
}
/**
* Return just the media that belongs to this tweet
*/
def ownMedia(tweet: Tweet): Seq[MediaEntity] =
TweetLenses.media.get(tweet).filter(isOwnMedia(tweet.id, _))
/**
* Does the given media entity, which is was found on the tweet with the specified
* tweetId, belong to that tweet?
*/
def isOwnMedia(tweetId: TweetId, entity: MediaEntity): Boolean =
entity.sourceStatusId.forall(_ == tweetId)
/**
* Mixed Media is any case where there is more than one media item & any of them is not an image.
*/
def isMixedMedia(mediaEntities: Seq[MediaEntity]): Boolean =
mediaEntities.length > 1 && (mediaEntities.flatMap(_.mediaInfo).exists {
case _: MediaInfo.ImageInfo => false
case _ => true
} ||
mediaEntities.flatMap(_.mediaKey).map(_.mediaCategory).exists(_ != MediaCategory.TweetImage))
}

View file

@ -0,0 +1,108 @@
package com.twitter.tweetypie
package media
import com.twitter.logging.Logger
import com.twitter.tweetypie.thriftscala.MediaEntity
import com.twitter.tweetypie.thriftscala.UrlEntity
/**
* Creating and parsing tweet media entity URLs.
*
* There are four kinds of URL in a media entity:
*
* - Display URLs: pic.twitter.com aliases for the short URL, for
* embedding in the tweet text.
*
* - Short URLs: regular t.co URLs that expand to the permalink URL.
*
* - Permalink URLs: link to a page that displays the media after
* doing authorization
*
* - Asset URLs: links to the actual media asset.
*
*/
object MediaUrl {
private[this] val log = Logger(getClass)
/**
* The URL that should be filled in to the displayUrl field of the
* media entity. This URL behaves exactly the same as a t.co link
* (only the domain is different.)
*/
object Display {
val Root = "pic.twitter.com/"
def fromTcoSlug(tcoSlug: String): String = Root + tcoSlug
}
/**
* The link target for the link in the tweet text (the expanded URL
* for the media, copied from the URL entity.) For native photos,
* this is the tweet permalink page.
*
* For users without a screen name ("handleless" or NoScreenName users)
* a permalink to /i/status/:tweet_id is used.
*/
object Permalink {
val Root = "https://twitter.com/"
val Internal = "i"
val PhotoSuffix = "/photo/1"
val VideoSuffix = "/video/1"
def apply(screenName: String, tweetId: TweetId, isVideo: Boolean): String =
Root +
(if (screenName.isEmpty) Internal else screenName) +
"/status/" +
tweetId +
(if (isVideo) VideoSuffix else PhotoSuffix)
private[this] val PermalinkRegex =
"""https?://twitter.com/(?:#!/)?\w+/status/(\d+)/(?:photo|video)/\d+""".r
private[this] def getTweetId(permalink: String): Option[TweetId] =
permalink match {
case PermalinkRegex(tweetIdStr) =>
try {
Some(tweetIdStr.toLong)
} catch {
// Digits too big to fit in a Long
case _: NumberFormatException => None
}
case _ => None
}
def getTweetId(urlEntity: UrlEntity): Option[TweetId] =
urlEntity.expanded.flatMap(getTweetId)
def hasTweetId(permalink: String, tweetId: TweetId): Boolean =
getTweetId(permalink).contains(tweetId)
def hasTweetId(mediaEntity: MediaEntity, tweetId: TweetId): Boolean =
hasTweetId(mediaEntity.expandedUrl, tweetId)
def hasTweetId(urlEntity: UrlEntity, tweetId: TweetId): Boolean =
getTweetId(urlEntity).contains(tweetId)
}
/**
* Converts a url that starts with "https://" to one that starts with "http://".
*/
def httpsToHttp(url: String): String =
url.replace("https://", "http://")
/**
* Gets the last path element from an asset url. This exists temporarily to support
* the now deprecated mediaPath element in MediaEntity.
*/
def mediaPathFromUrl(url: String): String =
url.lastIndexOf('/') match {
case -1 =>
log.error("Invalid media path. Could not find last element: " + url)
// Better to return a broken preview URL to the client
// than to fail the whole request.
""
case idx =>
url.substring(idx + 1)
}
}

View file

@ -0,0 +1,7 @@
package com.twitter.tweetypie
package object media {
type TweetId = Long
type UserId = Long
type MediaId = Long
}

View file

@ -0,0 +1,80 @@
package com.twitter.tweetypie.storage
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.stitch.Stitch
import com.twitter.storage.client.manhattan.kv.ManhattanValue
import com.twitter.tweetypie.storage.TweetUtils.collectWithRateLimitCheck
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
import com.twitter.tweetypie.thriftscala.Tweet
import com.twitter.util.Time
object AddTweetHandler {
private[storage] type InternalAddTweet = (
Tweet,
ManhattanOperations.Insert,
Scribe,
StatsReceiver,
Time
) => Stitch[Unit]
def apply(
insert: ManhattanOperations.Insert,
scribe: Scribe,
stats: StatsReceiver
): TweetStorageClient.AddTweet =
tweet => doAddTweet(tweet, insert, scribe, stats, Time.now)
def makeRecords(
storedTweet: StoredTweet,
timestamp: Time
): Seq[TweetManhattanRecord] = {
val core = CoreFieldsCodec.fromTweet(storedTweet)
val packedCoreFieldsBlob = CoreFieldsCodec.toTFieldBlob(core)
val coreRecord =
TweetManhattanRecord(
TweetKey.coreFieldsKey(storedTweet.id),
ManhattanValue(TFieldBlobCodec.toByteBuffer(packedCoreFieldsBlob), Some(timestamp))
)
val otherFieldIds =
TweetFields.nonCoreInternalFields ++ TweetFields.getAdditionalFieldIds(storedTweet)
val otherFields =
storedTweet
.getFieldBlobs(otherFieldIds)
.map {
case (fieldId, tFieldBlob) =>
TweetManhattanRecord(
TweetKey.fieldKey(storedTweet.id, fieldId),
ManhattanValue(TFieldBlobCodec.toByteBuffer(tFieldBlob), Some(timestamp))
)
}
.toSeq
otherFields :+ coreRecord
}
private[storage] val doAddTweet: InternalAddTweet = (
tweet: Tweet,
insert: ManhattanOperations.Insert,
scribe: Scribe,
stats: StatsReceiver,
timestamp: Time
) => {
assert(tweet.coreData.isDefined, s"Tweet ${tweet.id} is missing coreData: $tweet")
val storedTweet = StorageConversions.toStoredTweet(tweet)
val records = makeRecords(storedTweet, timestamp)
val inserts = records.map(insert)
val insertsWithRateLimitCheck =
Stitch.collect(inserts.map(_.liftToTry)).map(collectWithRateLimitCheck).lowerFromTry
Stats.updatePerFieldQpsCounters(
"addTweet",
TweetFields.getAdditionalFieldIds(storedTweet),
1,
stats
)
insertsWithRateLimitCheck.unit.onSuccess { _ => scribe.logAdded(storedTweet) }
}
}

View file

@ -0,0 +1,47 @@
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
strict_deps = True,
tags = [
"bazel-compatible",
"bazel-incompatible-scaladoc",
],
dependencies = [
"3rdparty/jvm/com/chuusai:shapeless",
"3rdparty/jvm/com/fasterxml/jackson/core:jackson-databind",
"3rdparty/jvm/com/fasterxml/jackson/module:jackson-module-scala",
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/twitter/bijection:core",
"3rdparty/jvm/com/twitter/bijection:scrooge",
"3rdparty/jvm/com/twitter/bijection:thrift",
"3rdparty/jvm/commons-codec",
"3rdparty/jvm/org/apache/thrift:libthrift",
"diffshow",
"finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authorization",
"finagle/finagle-core/src/main",
"finagle/finagle-stats",
"finagle/finagle-thriftmux/src/main/scala",
"mediaservices/commons/src/main/thrift:thrift-scala",
"scrooge/scrooge-serializer/src/main/scala",
"tweetypie/servo/repo/src/main/scala",
"tweetypie/servo/util",
"snowflake:id",
"src/thrift/com/twitter/escherbird:media-annotation-structs-scala",
"src/thrift/com/twitter/manhattan:internal-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:media-entity-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:service-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
"stitch/stitch-core",
"storage/clients/manhattan/client/src/main/scala",
"tbird-thrift:scala",
"tweetypie/common/src/scala/com/twitter/tweetypie/additionalfields",
"tweetypie/common/src/scala/com/twitter/tweetypie/client_id",
"tweetypie/common/src/scala/com/twitter/tweetypie/util",
"tweetypie/common/src/thrift/com/twitter/tweetypie/storage_internal:storage_internal-scala",
"util-internal/scribe",
"util/util-core:scala",
"util/util-slf4j-api/src/main/scala/com/twitter/util/logging",
"util/util-stats/src/main/scala",
],
)

View file

@ -0,0 +1,20 @@
package com.twitter.tweetypie.storage
import com.twitter.util.Time
object BounceDeleteHandler {
def apply(
insert: ManhattanOperations.Insert,
scribe: Scribe
): TweetStorageClient.BounceDelete =
tweetId => {
val mhTimestamp = Time.now
val bounceDeleteRecord = TweetStateRecord
.BounceDeleted(tweetId, mhTimestamp.inMillis)
.toTweetMhRecord
insert(bounceDeleteRecord).onSuccess { _ =>
scribe.logRemoved(tweetId, mhTimestamp, isSoftDeleted = true)
}
}
}

View file

@ -0,0 +1,242 @@
package com.twitter.tweetypie.storage
import com.twitter.bijection.Conversion.asMethod
import com.twitter.bijection.Injection
import com.twitter.scrooge.TFieldBlob
import com.twitter.storage.client.manhattan.kv._
import com.twitter.tweetypie.storage.Response.FieldResponse
import com.twitter.tweetypie.storage.Response.FieldResponseCode
import com.twitter.tweetypie.storage_internal.thriftscala.CoreFields
import com.twitter.tweetypie.storage_internal.thriftscala.InternalTweet
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
import java.io.ByteArrayOutputStream
import java.nio.ByteBuffer
import org.apache.thrift.protocol.TBinaryProtocol
import org.apache.thrift.transport.TIOStreamTransport
import org.apache.thrift.transport.TMemoryInputTransport
import scala.collection.immutable
import scala.util.control.NoStackTrace
// NOTE: All field ids and Tweet structure in this file correspond to the StoredTweet struct ONLY
object ByteArrayCodec {
def toByteBuffer(byteArray: Array[Byte]): ByteBuffer = byteArray.as[ByteBuffer]
def fromByteBuffer(buffer: ByteBuffer): Array[Byte] = buffer.as[Array[Byte]]
}
object StringCodec {
private val string2ByteBuffer = Injection.connect[String, Array[Byte], ByteBuffer]
def toByteBuffer(strValue: String): ByteBuffer = string2ByteBuffer(strValue)
def fromByteBuffer(buffer: ByteBuffer): String = string2ByteBuffer.invert(buffer).get
}
/**
* Terminology
* -----------
* Tweet id field : The field number of 'tweetId' in the 'Tweet' thrift structure (i.e "1")
*
* First AdditionalField id : The ID if the first additional field in 'Tweet' thrift structure. All field Ids less than this are
* considered internal and all the ids greater than or equal to this field id are considered 'Additional fields'.
* This is set to 100.
*
* Internal Fields : Fields with ids [1 to firstAdditionalFieldid) (excluding firstAdditionalFieldId)
*
* Core fields : (Subset of Internal fields)- Fields with ids [1 to 8, 19]. These fields are "packed" together and stored
* under a single key. This key is referred to as "CoreFieldsKey" (see @TweetKeyType.CoreFieldsKey).
* Note: Actually field 1 is skipped when packing as this field is the tweet id and it need not be
* explicitly stored since the pkey already contains the tweet Id)
*
* Root Core field id : The field id under which the packed core fields are stored in Manhattan. (This is field Id "1")
*
* Required fields : (Subset of Core fields) - Fields with ids [1 to 5] that MUST be present on every tweet.
*
* Additional Fields : All fields with field ids >= 'firstAdditionalFieldId'
*
* Compiled Additional fields : (Subset of Additional Fields) - All fields that the storage library knows about
* (i.e present on the latest storage_internal.thrift that is compiled-in).
*
* Passthrough fields : (Subset of Additional Fields) - The fields on storage_internal.thrift that the storage library is NOT aware of
* These field ids are is obtained looking at the "_passThroughFields" member of the scrooge-generated
* 'Tweet' object.
*
* coreFieldsIdInInternalTweet: This is the field id of the core fields (the only field) in the Internal Tweet struct
*/
object TweetFields {
val firstAdditionalFieldId: Short = 100
val tweetIdField: Short = 1
val geoFieldId: Short = 9
// The field under which all the core field values are stored (in serialized form).
val rootCoreFieldId: Short = 1
val coreFieldIds: immutable.IndexedSeq[FieldId] = {
val quotedTweetFieldId: Short = 19
(1 to 8).map(_.toShort) ++ Seq(quotedTweetFieldId)
}
val requiredFieldIds: immutable.IndexedSeq[FieldId] = (1 to 5).map(_.toShort)
val coreFieldsIdInInternalTweet: Short = 1
val compiledAdditionalFieldIds: Seq[FieldId] =
StoredTweet.metaData.fields.filter(_.id >= firstAdditionalFieldId).map(_.id)
val internalFieldIds: Seq[FieldId] =
StoredTweet.metaData.fields.filter(_.id < firstAdditionalFieldId).map(_.id)
val nonCoreInternalFields: Seq[FieldId] =
(internalFieldIds.toSet -- coreFieldIds.toSet).toSeq
def getAdditionalFieldIds(tweet: StoredTweet): Seq[FieldId] =
compiledAdditionalFieldIds ++ tweet._passthroughFields.keys.toSeq
}
/**
* Helper object to convert TFieldBlob to ByteBuffer that gets stored in Manhattan.
*
* The following is the format in which the TFieldBlob gets stored:
* [Version][TField][TFieldBlob]
*/
object TFieldBlobCodec {
val BinaryProtocolFactory: TBinaryProtocol.Factory = new TBinaryProtocol.Factory()
val FormatVersion = 1.0
def toByteBuffer(tFieldBlob: TFieldBlob): ByteBuffer = {
val baos = new ByteArrayOutputStream()
val prot = BinaryProtocolFactory.getProtocol(new TIOStreamTransport(baos))
prot.writeDouble(FormatVersion)
prot.writeFieldBegin(tFieldBlob.field)
prot.writeBinary(ByteArrayCodec.toByteBuffer(tFieldBlob.data))
ByteArrayCodec.toByteBuffer(baos.toByteArray)
}
def fromByteBuffer(buffer: ByteBuffer): TFieldBlob = {
val byteArray = ByteArrayCodec.fromByteBuffer(buffer)
val prot = BinaryProtocolFactory.getProtocol(new TMemoryInputTransport(byteArray))
val version = prot.readDouble()
if (version != FormatVersion) {
throw new VersionMismatchError(
"Version mismatch in decoding ByteBuffer to TFieldBlob. " +
"Actual version: " + version + ". Expected version: " + FormatVersion
)
}
val tField = prot.readFieldBegin()
val dataBuffer = prot.readBinary()
val data = ByteArrayCodec.fromByteBuffer(dataBuffer)
TFieldBlob(tField, data)
}
}
/**
* Helper object to help convert 'CoreFields' object to/from TFieldBlob (and also to construct
* 'CoreFields' object from a 'StoredTweet' object)
*
* More details:
* - A subset of fields on the 'StoredTweet' thrift structure (2-8,19) are 'packaged' and stored
* together as a serialized TFieldBlob object under a single key in Manhattan (see TweetKeyCodec
* helper object above for more details).
*
* - To make the packing/unpacking the fields to/from TFieldBlob object, we created the following
* two helper thrift structures 'CoreFields' and 'InternalTweet'
*
* // The field Ids and types here MUST exactly match field Ids on 'StoredTweet' thrift structure.
* struct CoreFields {
* 2: optional i64 user_id
* ...
* 8: optional i64 contributor_id
* ...
* 19: optional StoredQuotedTweet stored_quoted_tweet
*
* }
*
* // The field id of core fields MUST be "1"
* struct InternalTweet {
* 1: CoreFields coreFields
* }
*
* - Given the above two structures, packing/unpacking fields (2-8,19) on StoredTweet object into a TFieldBlob
* becomes very trivial:
* For packing:
* (i) Copy fields (2-8,19) from StoredTweet object to a new CoreFields object
* (ii) Create a new InternalTweet object with the 'CoreFields' object constructed in step (i) above
* (iii) Extract field "1" as a TFieldBlob from InternalField (by calling the scrooge generated "getFieldBlob(1)"
* function on the InternalField objecton
*
* For unpacking:
* (i) Create an empty 'InternalField' object
* (ii) Call scrooge-generated 'setField' by passing the tFieldBlob blob (created by packing steps above)
* (iii) Doing step (ii) above will create a hydrated 'CoreField' object that can be accessed by 'coreFields'
* member of 'InternalTweet' object.
*/
object CoreFieldsCodec {
val coreFieldIds: Seq[FieldId] = CoreFields.metaData.fields.map(_.id)
// "Pack" the core fields i.e converts 'CoreFields' object to "packed" tFieldBlob (See description
// above for more details)
def toTFieldBlob(coreFields: CoreFields): TFieldBlob = {
InternalTweet(Some(coreFields)).getFieldBlob(TweetFields.coreFieldsIdInInternalTweet).get
}
// "Unpack" the core fields from a packed TFieldBlob into a CoreFields object (see description above for
// more details)
def fromTFieldBlob(tFieldBlob: TFieldBlob): CoreFields = {
InternalTweet().setField(tFieldBlob).coreFields.get
}
// "Unpack" the core fields from a packed TFieldBlob into a Map of core-fieldId-> TFieldBlob
def unpackFields(tFieldBlob: TFieldBlob): Map[Short, TFieldBlob] =
fromTFieldBlob(tFieldBlob).getFieldBlobs(coreFieldIds)
// Create a 'CoreFields' thrift object from 'Tweet' thrift object.
def fromTweet(tweet: StoredTweet): CoreFields = {
// As mentioned above, the field ids and types on the 'CoreFields' struct exactly match the
// corresponding fields on StoredTweet structure. So it is safe to call .getField() on Tweet object and
// and pass the returned tFleldBlob a 'setField' on 'CoreFields' object.
coreFieldIds.foldLeft(CoreFields()) {
case (core, fieldId) =>
tweet.getFieldBlob(fieldId) match {
case None => core
case Some(tFieldBlob) => core.setField(tFieldBlob)
}
}
}
}
/**
* Helper object to convert ManhattanException to FieldResponseCode thrift object
*/
object FieldResponseCodeCodec {
import FieldResponseCodec.ValueNotFoundException
def fromManhattanException(mhException: ManhattanException): FieldResponseCode = {
mhException match {
case _: ValueNotFoundException => FieldResponseCode.ValueNotFound
case _: InternalErrorManhattanException => FieldResponseCode.Error
case _: InvalidRequestManhattanException => FieldResponseCode.InvalidRequest
case _: DeniedManhattanException => FieldResponseCode.Error
case _: UnsatisfiableManhattanException => FieldResponseCode.Error
case _: TimeoutManhattanException => FieldResponseCode.Timeout
}
}
}
/**
* Helper object to construct FieldResponse thrift object from an Exception.
* This is typically called to convert 'ManhattanException' object to 'FieldResponse' thrift object
*/
object FieldResponseCodec {
class ValueNotFoundException extends ManhattanException("Value not found!") with NoStackTrace
private[storage] val NotFound = new ValueNotFoundException
def fromThrowable(e: Throwable, additionalMsg: Option[String] = None): FieldResponse = {
val (respCode, errMsg) = e match {
case mhException: ManhattanException =>
(FieldResponseCodeCodec.fromManhattanException(mhException), mhException.getMessage)
case _ => (FieldResponseCode.Error, e.getMessage)
}
val respMsg = additionalMsg.map(_ + ". " + errMsg).orElse(Some(errMsg.toString))
FieldResponse(respCode, respMsg)
}
}

View file

@ -0,0 +1,67 @@
package com.twitter.tweetypie.storage
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.stitch.Stitch
import com.twitter.storage.client.manhattan.kv.DeniedManhattanException
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.util.Throw
import com.twitter.util.Time
object DeleteAdditionalFieldsHandler {
def apply(
delete: ManhattanOperations.Delete,
stats: StatsReceiver
): TweetStorageClient.DeleteAdditionalFields =
(unfilteredTweetIds: Seq[TweetId], additionalFields: Seq[Field]) => {
val tweetIds = unfilteredTweetIds.filter(_ > 0)
val additionalFieldIds = additionalFields.map(_.id)
require(additionalFields.nonEmpty, "Additional fields to delete cannot be empty")
require(
additionalFieldIds.min >= TweetFields.firstAdditionalFieldId,
s"Additional fields $additionalFields must be in additional field range (>= ${TweetFields.firstAdditionalFieldId})"
)
Stats.addWidthStat("deleteAdditionalFields", "tweetIds", tweetIds.size, stats)
Stats.addWidthStat(
"deleteAdditionalFields",
"additionalFieldIds",
additionalFieldIds.size,
stats
)
Stats.updatePerFieldQpsCounters(
"deleteAdditionalFields",
additionalFieldIds,
tweetIds.size,
stats
)
val mhTimestamp = Time.now
val stitches = tweetIds.map { tweetId =>
val (fieldIds, mhKeysToDelete) =
additionalFieldIds.map { fieldId =>
(fieldId, TweetKey.additionalFieldsKey(tweetId, fieldId))
}.unzip
val deletionStitches = mhKeysToDelete.map { mhKeyToDelete =>
delete(mhKeyToDelete, Some(mhTimestamp)).liftToTry
}
Stitch.collect(deletionStitches).map { responsesTries =>
val wasRateLimited = responsesTries.exists {
case Throw(e: DeniedManhattanException) => true
case _ => false
}
val resultsPerTweet = fieldIds.zip(responsesTries).toMap
if (wasRateLimited) {
buildTweetOverCapacityResponse("deleteAdditionalFields", tweetId, resultsPerTweet)
} else {
buildTweetResponse("deleteAdditionalFields", tweetId, resultsPerTweet)
}
}
}
Stitch.collect(stitches)
}
}

View file

@ -0,0 +1,41 @@
package com.twitter.tweetypie.storage
import com.twitter.tweetypie.additionalfields.AdditionalFields
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
import com.twitter.tweetypie.thriftscala.{Tweet => TpTweet}
/**
* A field of the stored version of a tweet to read, update, or delete.
*
* There is not a one-to-one correspondence between the fields ids of
* [[com.twitter.tweetypie.thriftscala.Tweet]] and
* [[com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet]]. For example, in StoredTweet,
* the nsfwUser property is field 11; in Tweet, it is a property of the coreData struct in field 2.
* To circumvent the confusion of using one set of field ids or the other, callers use instances of
* [[Field]] to reference the part of the object to modify.
*/
class Field private[storage] (val id: Short) extends AnyVal {
override def toString: String = id.toString
}
/**
* NOTE: Make sure `AllUpdatableCompiledFields` is kept up to date when adding any new field
*/
object Field {
import AdditionalFields.isAdditionalFieldId
val Geo: Field = new Field(StoredTweet.GeoField.id)
val HasTakedown: Field = new Field(StoredTweet.HasTakedownField.id)
val NsfwUser: Field = new Field(StoredTweet.NsfwUserField.id)
val NsfwAdmin: Field = new Field(StoredTweet.NsfwAdminField.id)
val TweetypieOnlyTakedownCountryCodes: Field =
new Field(TpTweet.TweetypieOnlyTakedownCountryCodesField.id)
val TweetypieOnlyTakedownReasons: Field =
new Field(TpTweet.TweetypieOnlyTakedownReasonsField.id)
val AllUpdatableCompiledFields: Set[Field] = Set(Geo, HasTakedown, NsfwUser, NsfwAdmin)
def additionalField(id: Short): Field = {
require(isAdditionalFieldId(id), "field id must be in the additional field range")
new Field(id)
}
}

View file

@ -0,0 +1,150 @@
package com.twitter.tweetypie.storage
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.stitch.Stitch
import com.twitter.storage.client.manhattan.kv.DeniedManhattanException
import com.twitter.tweetypie.storage.Response.TweetResponseCode
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
import com.twitter.tweetypie.thriftscala.DeletedTweet
import scala.util.control.NonFatal
sealed trait DeleteState
object DeleteState {
/**
* This tweet is deleted but has not been permanently deleted from Manhattan. Tweets in this state
* may be undeleted.
*/
case object SoftDeleted extends DeleteState
/**
* This tweet is deleted after being bounced for violating the Twitter Rules but has not been
* permanently deleted from Manhattan. Tweets in this state may NOT be undeleted.
*/
case object BounceDeleted extends DeleteState
/**
* This tweet has been permanently deleted from Manhattan.
*/
case object HardDeleted extends DeleteState
/**
* There is no data in Manhattan to distinguish this tweet id from one that never existed.
*/
case object NotFound extends DeleteState
/**
* This tweet exists and is not in a deleted state.
*/
case object NotDeleted extends DeleteState
}
case class DeletedTweetResponse(
tweetId: TweetId,
overallResponse: TweetResponseCode,
deleteState: DeleteState,
tweet: Option[DeletedTweet])
object GetDeletedTweetsHandler {
def apply(
read: ManhattanOperations.Read,
stats: StatsReceiver
): TweetStorageClient.GetDeletedTweets =
(unfilteredTweetIds: Seq[TweetId]) => {
val tweetIds = unfilteredTweetIds.filter(_ > 0)
Stats.addWidthStat("getDeletedTweets", "tweetIds", tweetIds.size, stats)
val stitches = tweetIds.map { tweetId =>
read(tweetId)
.map { mhRecords =>
val storedTweet = buildStoredTweet(tweetId, mhRecords)
TweetStateRecord.mostRecent(mhRecords) match {
case Some(m: TweetStateRecord.SoftDeleted) => softDeleted(m, storedTweet)
case Some(m: TweetStateRecord.BounceDeleted) => bounceDeleted(m, storedTweet)
case Some(m: TweetStateRecord.HardDeleted) => hardDeleted(m, storedTweet)
case _ if storedTweet.getFieldBlobs(expectedFields).isEmpty => notFound(tweetId)
case _ => notDeleted(tweetId, storedTweet)
}
}
.handle {
case _: DeniedManhattanException =>
DeletedTweetResponse(
tweetId,
TweetResponseCode.OverCapacity,
DeleteState.NotFound,
None
)
case NonFatal(ex) =>
TweetUtils.log.warning(
ex,
s"Unhandled exception in GetDeletedTweetsHandler for tweetId: $tweetId"
)
DeletedTweetResponse(tweetId, TweetResponseCode.Failure, DeleteState.NotFound, None)
}
}
Stitch.collect(stitches)
}
private def notFound(tweetId: TweetId) =
DeletedTweetResponse(
tweetId = tweetId,
overallResponse = TweetResponseCode.Success,
deleteState = DeleteState.NotFound,
tweet = None
)
private def softDeleted(record: TweetStateRecord.SoftDeleted, storedTweet: StoredTweet) =
DeletedTweetResponse(
record.tweetId,
TweetResponseCode.Success,
DeleteState.SoftDeleted,
Some(
StorageConversions
.toDeletedTweet(storedTweet)
.copy(deletedAtMsec = Some(record.createdAt))
)
)
private def bounceDeleted(record: TweetStateRecord.BounceDeleted, storedTweet: StoredTweet) =
DeletedTweetResponse(
record.tweetId,
TweetResponseCode.Success,
DeleteState.BounceDeleted,
Some(
StorageConversions
.toDeletedTweet(storedTweet)
.copy(deletedAtMsec = Some(record.createdAt))
)
)
private def hardDeleted(record: TweetStateRecord.HardDeleted, storedTweet: StoredTweet) =
DeletedTweetResponse(
record.tweetId,
TweetResponseCode.Success,
DeleteState.HardDeleted,
Some(
StorageConversions
.toDeletedTweet(storedTweet)
.copy(
hardDeletedAtMsec = Some(record.createdAt),
deletedAtMsec = Some(record.deletedAt)
)
)
)
/**
* notDeleted returns a tweet to simplify tweetypie.handler.UndeleteTweetHandler
*/
private def notDeleted(tweetId: TweetId, storedTweet: StoredTweet) =
DeletedTweetResponse(
tweetId = tweetId,
overallResponse = TweetResponseCode.Success,
deleteState = DeleteState.NotDeleted,
tweet = Some(StorageConversions.toDeletedTweet(storedTweet))
)
}

View file

@ -0,0 +1,126 @@
package com.twitter.tweetypie.storage
import com.twitter.conversions.DurationOps._
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.stitch.Stitch
import com.twitter.stitch.StitchSeqGroup
import com.twitter.tweetypie.storage.TweetStorageClient.GetStoredTweet
import com.twitter.tweetypie.storage.TweetStorageClient.GetStoredTweet.Error
import com.twitter.tweetypie.storage.TweetStorageClient.GetStoredTweet.Response._
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.tweetypie.thriftscala.Tweet
import com.twitter.util.Time
import com.twitter.util.Try
import scala.collection.mutable
object GetStoredTweetHandler {
private[this] object DeletedState {
def unapply(stateRecord: Option[TweetStateRecord]): Option[TweetStateRecord] =
stateRecord match {
case state @ (Some(_: TweetStateRecord.SoftDeleted) | Some(
_: TweetStateRecord.HardDeleted) | Some(_: TweetStateRecord.BounceDeleted)) =>
state
case _ => None
}
}
private[this] def deletedAtMs(stateRecord: Option[TweetStateRecord]): Option[Long] =
stateRecord match {
case Some(d: TweetStateRecord.SoftDeleted) => Some(d.createdAt)
case Some(d: TweetStateRecord.BounceDeleted) => Some(d.createdAt)
case Some(d: TweetStateRecord.HardDeleted) => Some(d.deletedAt)
case _ => None
}
private[this] def tweetResponseFromRecords(
tweetId: TweetId,
mhRecords: Seq[TweetManhattanRecord],
statsReceiver: StatsReceiver,
): GetStoredTweet.Response = {
val errs =
mutable.Buffer[Error]()
val hasStoredTweetFields: Boolean = mhRecords.exists {
case TweetManhattanRecord(TweetKey(_, _: TweetKey.LKey.FieldKey), _) => true
case _ => false
}
val storedTweet = if (hasStoredTweetFields) {
Try(buildStoredTweet(tweetId, mhRecords, includeScrubbed = true))
.onFailure(_ => errs.append(Error.TweetIsCorrupt))
.toOption
} else {
None
}
val scrubbedFields: Set[FieldId] = extractScrubbedFields(mhRecords)
val tweet: Option[Tweet] = storedTweet.map(StorageConversions.fromStoredTweetAllowInvalid)
val stateRecords: Seq[TweetStateRecord] = TweetStateRecord.fromTweetMhRecords(mhRecords)
val tweetState: Option[TweetStateRecord] = TweetStateRecord.mostRecent(mhRecords)
storedTweet.foreach { storedTweet =>
val storedExpectedFields = storedTweet.getFieldBlobs(expectedFields)
val missingExpectedFields = expectedFields.filterNot(storedExpectedFields.contains)
if (missingExpectedFields.nonEmpty || !isValid(storedTweet)) {
errs.append(Error.TweetFieldsMissingOrInvalid)
}
val invalidScrubbedFields = storedTweet.getFieldBlobs(scrubbedFields).keys
if (invalidScrubbedFields.nonEmpty) {
errs.append(Error.ScrubbedFieldsPresent)
}
if (deletedAtMs(tweetState).exists(_ < Time.now.inMilliseconds - 14.days.inMilliseconds)) {
errs.append(Error.TweetShouldBeHardDeleted)
}
}
val err = Option(errs.toList).filter(_.nonEmpty)
(tweet, tweetState, err) match {
case (None, None, None) =>
statsReceiver.counter("not_found").incr()
NotFound(tweetId)
case (None, Some(tweetState: TweetStateRecord.HardDeleted), None) =>
statsReceiver.counter("hard_deleted").incr()
HardDeleted(tweetId, Some(tweetState), stateRecords, scrubbedFields)
case (None, _, Some(errs)) =>
statsReceiver.counter("failed").incr()
Failed(tweetId, tweetState, stateRecords, scrubbedFields, errs)
case (Some(tweet), _, Some(errs)) =>
statsReceiver.counter("found_invalid").incr()
FoundWithErrors(tweet, tweetState, stateRecords, scrubbedFields, errs)
case (Some(tweet), DeletedState(state), None) =>
statsReceiver.counter("deleted").incr()
FoundDeleted(tweet, Some(state), stateRecords, scrubbedFields)
case (Some(tweet), _, None) =>
statsReceiver.counter("found").incr()
Found(tweet, tweetState, stateRecords, scrubbedFields)
}
}
def apply(read: ManhattanOperations.Read, statsReceiver: StatsReceiver): GetStoredTweet = {
object mhGroup extends StitchSeqGroup[TweetId, Seq[TweetManhattanRecord]] {
override def run(tweetIds: Seq[TweetId]): Stitch[Seq[Seq[TweetManhattanRecord]]] = {
Stats.addWidthStat("getStoredTweet", "tweetIds", tweetIds.size, statsReceiver)
Stitch.traverse(tweetIds)(read(_))
}
}
tweetId =>
if (tweetId <= 0) {
Stitch.NotFound
} else {
Stitch
.call(tweetId, mhGroup)
.map(mhRecords =>
tweetResponseFromRecords(tweetId, mhRecords, statsReceiver.scope("getStoredTweet")))
}
}
}

View file

@ -0,0 +1,167 @@
package com.twitter.tweetypie.storage
import com.twitter.conversions.DurationOps._
import com.twitter.finagle.stats.Counter
import com.twitter.finagle.stats.NullStatsReceiver
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.logging.Logger
import com.twitter.snowflake.id.SnowflakeId
import com.twitter.stitch.Stitch
import com.twitter.stitch.StitchSeqGroup
import com.twitter.storage.client.manhattan.kv.DeniedManhattanException
import com.twitter.storage.client.manhattan.kv.ManhattanException
import com.twitter.tweetypie.storage.TweetStateRecord.BounceDeleted
import com.twitter.tweetypie.storage.TweetStateRecord.HardDeleted
import com.twitter.tweetypie.storage.TweetStateRecord.SoftDeleted
import com.twitter.tweetypie.storage.TweetStorageClient.GetTweet
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.util.Duration
import com.twitter.util.Return
import com.twitter.util.Throw
import com.twitter.util.Time
object GetTweetHandler {
private[this] val logger = Logger(getClass)
//////////////////////////////////////////////////
// Logging racy reads for later validation.
val RacyTweetWindow: Duration = 10.seconds
/**
* If this read is soon after the tweet was created, then we would usually
* expect it to be served from cache. This early read indicates that this
* tweet is prone to consistency issues, so we log what's present in
* Manhattan at the time of the read for later analysis.
*/
private[this] def logRacyRead(tweetId: TweetId, records: Seq[TweetManhattanRecord]): Unit =
if (SnowflakeId.isSnowflakeId(tweetId)) {
val tweetAge = Time.now.since(SnowflakeId(tweetId).time)
if (tweetAge <= RacyTweetWindow) {
val sb = new StringBuilder
sb.append("racy_tweet_read\t")
.append(tweetId)
.append('\t')
.append(tweetAge.inMilliseconds) // Log the age for analysis purposes
records.foreach { rec =>
sb.append('\t')
.append(rec.lkey)
rec.value.timestamp.foreach { ts =>
// If there is a timestamp for this key, log it so that we can tell
// later on whether a value should have been present. We expect
// keys written in a single write to have the same timestamp, and
// generally, keys written in separate writes will have different
// timestamps. The timestamp value is optional in Manhattan, but
// we expect there to always be a value for the timestamp.
sb.append(':')
.append(ts.inMilliseconds)
}
}
logger.info(sb.toString)
}
}
/**
* Convert a set of records from Manhattan into a GetTweet.Response.
*/
def tweetResponseFromRecords(
tweetId: TweetId,
mhRecords: Seq[TweetManhattanRecord],
statsReceiver: StatsReceiver = NullStatsReceiver
): GetTweet.Response =
if (mhRecords.isEmpty) {
GetTweet.Response.NotFound
} else {
// If no internal fields are present or no required fields present, we consider the tweet
// as not returnable (even if some additional fields are present)
def tweetFromRecords(tweetId: TweetId, mhRecords: Seq[TweetManhattanRecord]) = {
val storedTweet = buildStoredTweet(tweetId, mhRecords)
if (storedTweet.getFieldBlobs(expectedFields).nonEmpty) {
if (isValid(storedTweet)) {
statsReceiver.counter("valid").incr()
Some(StorageConversions.fromStoredTweet(storedTweet))
} else {
log.info(s"Invalid Tweet Id: $tweetId")
statsReceiver.counter("invalid").incr()
None
}
} else {
// The Tweet contained none of the fields defined in `expectedFields`
log.info(s"Expected Fields Not Present Tweet Id: $tweetId")
statsReceiver.counter("expected_fields_not_present").incr()
None
}
}
val stateRecord = TweetStateRecord.mostRecent(mhRecords)
stateRecord match {
// some other cases don't require an attempt to construct a Tweet
case Some(_: SoftDeleted) | Some(_: HardDeleted) => GetTweet.Response.Deleted
// all other cases require an attempt to construct a Tweet, which may not be successful
case _ =>
logRacyRead(tweetId, mhRecords)
(stateRecord, tweetFromRecords(tweetId, mhRecords)) match {
// BounceDeleted contains the Tweet data so that callers can access data on the the
// tweet (e.g. hard delete daemon requires conversationId and userId. There are no
// plans for Tweetypie server to make use of the returned tweet at this time.
case (Some(_: BounceDeleted), Some(tweet)) => GetTweet.Response.BounceDeleted(tweet)
case (Some(_: BounceDeleted), None) => GetTweet.Response.Deleted
case (_, Some(tweet)) => GetTweet.Response.Found(tweet)
case _ => GetTweet.Response.NotFound
}
}
}
def apply(read: ManhattanOperations.Read, statsReceiver: StatsReceiver): GetTweet = {
object stats {
val getTweetScope = statsReceiver.scope("getTweet")
val deniedCounter: Counter = getTweetScope.counter("mh_denied")
val mhExceptionCounter: Counter = getTweetScope.counter("mh_exception")
val nonFatalExceptionCounter: Counter = getTweetScope.counter("non_fatal_exception")
val notFoundCounter: Counter = getTweetScope.counter("not_found")
}
object mhGroup extends StitchSeqGroup[TweetId, Seq[TweetManhattanRecord]] {
override def run(tweetIds: Seq[TweetId]): Stitch[Seq[Seq[TweetManhattanRecord]]] = {
Stats.addWidthStat("getTweet", "tweetIds", tweetIds.size, statsReceiver)
Stitch.traverse(tweetIds)(read(_))
}
}
tweetId =>
if (tweetId <= 0) {
Stitch.NotFound
} else {
Stitch
.call(tweetId, mhGroup)
.map(mhRecords => tweetResponseFromRecords(tweetId, mhRecords, stats.getTweetScope))
.liftToTry
.map {
case Throw(mhException: DeniedManhattanException) =>
stats.deniedCounter.incr()
Throw(RateLimited("", mhException))
// Encountered some other Manhattan error
case t @ Throw(_: ManhattanException) =>
stats.mhExceptionCounter.incr()
t
// Something else happened
case t @ Throw(ex) =>
stats.nonFatalExceptionCounter.incr()
TweetUtils.log
.warning(ex, s"Unhandled exception in GetTweetHandler for tweetId: $tweetId")
t
case r @ Return(GetTweet.Response.NotFound) =>
stats.notFoundCounter.incr()
r
case r @ Return(_) => r
}
.lowerFromTry
}
}
}

View file

@ -0,0 +1,153 @@
package com.twitter.tweetypie.storage
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.stitch.Stitch
import com.twitter.tweetypie.storage.TweetKey.LKey.ForceAddedStateKey
import com.twitter.tweetypie.storage.TweetStorageClient.HardDeleteTweet
import com.twitter.tweetypie.storage.TweetStorageClient.HardDeleteTweet.Response._
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.util.Return
import com.twitter.util.Throw
import com.twitter.util.Time
import com.twitter.util.Try
object HardDeleteTweetHandler {
/**
* When a tweet is removed lkeys with these prefixes will be deleted permanently.
*/
private[storage] def isKeyToBeDeleted(key: TweetKey): Boolean =
key.lKey match {
case (TweetKey.LKey.CoreFieldsKey | TweetKey.LKey.InternalFieldsKey(_) |
TweetKey.LKey.AdditionalFieldsKey(_) | TweetKey.LKey.SoftDeletionStateKey |
TweetKey.LKey.BounceDeletionStateKey | TweetKey.LKey.UnDeletionStateKey |
TweetKey.LKey.ForceAddedStateKey) =>
true
case _ => false
}
/**
* When hard deleting, there are two actions, writing the record and
* removing the tweet data. If we are performing any action, we will
* always try to remove the tweet data. If the tweet does not yet have a
* hard deletion record, then we will need to write one. This method
* returns the HardDeleted record if it needs to be written, and None
* if it has already been written.
*
* If the tweet is not in a deleted state we signal this with a
* Throw(NotDeleted).
*/
private[storage] def getHardDeleteStateRecord(
tweetId: TweetId,
records: Seq[TweetManhattanRecord],
mhTimestamp: Time,
stats: StatsReceiver
): Try[Option[TweetStateRecord.HardDeleted]] = {
val mostRecent = TweetStateRecord.mostRecent(records)
val currentStateStr = mostRecent.map(_.name).getOrElse("no_tweet_state_record")
stats.counter(currentStateStr).incr()
mostRecent match {
case Some(
record @ (TweetStateRecord.SoftDeleted(_, _) | TweetStateRecord.BounceDeleted(_, _))) =>
Return(
Some(
TweetStateRecord.HardDeleted(
tweetId = tweetId,
// createdAt is the hard deletion timestamp when dealing with hard deletes in Manhattan
createdAt = mhTimestamp.inMillis,
// deletedAt is the soft deletion timestamp when dealing with hard deletes in Manhattan
deletedAt = record.createdAt
)
)
)
case Some(_: TweetStateRecord.HardDeleted) =>
Return(None)
case Some(_: TweetStateRecord.ForceAdded) =>
Throw(NotDeleted(tweetId, Some(ForceAddedStateKey)))
case Some(_: TweetStateRecord.Undeleted) =>
Throw(NotDeleted(tweetId, Some(TweetKey.LKey.UnDeletionStateKey)))
case None =>
Throw(NotDeleted(tweetId, None))
}
}
/**
* This handler returns HardDeleteTweet.Response.Deleted if data associated with the tweet is deleted,
* either as a result of this request or a previous one.
*
* The most recently added record determines the tweet's state. This method will only delete data
* for tweets in the soft-delete or hard-delete state. (Calling hardDeleteTweet for tweets that have
* already been hard-deleted will remove any lkeys that may not have been deleted previously).
*/
def apply(
read: ManhattanOperations.Read,
insert: ManhattanOperations.Insert,
delete: ManhattanOperations.Delete,
scribe: Scribe,
stats: StatsReceiver
): TweetId => Stitch[HardDeleteTweet.Response] = {
val hardDeleteStats = stats.scope("hardDeleteTweet")
val hardDeleteTweetCancelled = hardDeleteStats.counter("cancelled")
val beforeStateStats = hardDeleteStats.scope("before_state")
def removeRecords(keys: Seq[TweetKey], mhTimestamp: Time): Stitch[Unit] =
Stitch
.collect(keys.map(key => delete(key, Some(mhTimestamp)).liftToTry))
.map(collectWithRateLimitCheck)
.lowerFromTry
def writeRecord(record: Option[TweetStateRecord.HardDeleted]): Stitch[Unit] =
record match {
case Some(r) =>
insert(r.toTweetMhRecord).onSuccess { _ =>
scribe.logRemoved(
r.tweetId,
Time.fromMilliseconds(r.createdAt),
isSoftDeleted = false
)
}
case None => Stitch.Unit
}
tweetId =>
read(tweetId)
.flatMap { records =>
val hardDeletionTimestamp = Time.now
val keysToBeDeleted: Seq[TweetKey] = records.map(_.key).filter(isKeyToBeDeleted)
getHardDeleteStateRecord(
tweetId,
records,
hardDeletionTimestamp,
beforeStateStats) match {
case Return(record) =>
Stitch
.join(
writeRecord(record),
removeRecords(keysToBeDeleted, hardDeletionTimestamp)
).map(_ =>
// If the tweetId is non-snowflake and has previously been hard deleted
// there will be no coreData record to fall back on to get the tweet
// creation time and createdAtMillis will be None.
Deleted(
// deletedAtMillis: when the tweet was hard deleted
deletedAtMillis = Some(hardDeletionTimestamp.inMillis),
// createdAtMillis: when the tweet itself was created
// (as opposed to when the deletion record was created)
createdAtMillis =
TweetUtils.creationTimeFromTweetIdOrMHRecords(tweetId, records)
))
case Throw(notDeleted: NotDeleted) =>
hardDeleteTweetCancelled.incr()
Stitch.value(notDeleted)
case Throw(e) => Stitch.exception(e) // this should never happen
}
}
}
}

View file

@ -0,0 +1,228 @@
package com.twitter.tweetypie.storage
import com.google.common.base.CaseFormat
import com.twitter.conversions.DurationOps._
import com.twitter.finagle.mtls.authentication.ServiceIdentifier
import com.twitter.scrooge.TFieldBlob
import com.twitter.scrooge.ThriftStructFieldInfo
import com.twitter.stitch.Stitch
import com.twitter.storage.client.manhattan.kv._
import com.twitter.tweetypie.additionalfields.AdditionalFields
import com.twitter.tweetypie.storage.ManhattanOperations.Read
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
import com.twitter.tweetypie.thriftscala.{Tweet => TweetypieTweet}
import com.twitter.util.Duration
import com.twitter.util.Future
import com.twitter.util.Return
import com.twitter.util.Throw
import diffshow.Container
import diffshow.DiffShow
import diffshow.Expr
import org.apache.commons.codec.binary.Base64
import scala.util.Try
import shapeless.Cached
import shapeless.Strict
// This class is used by the Tweetypie Console to inspect tweet field content in Manhattan
class InspectFields(svcIdentifier: ServiceIdentifier) {
val mhApplicationId = "tbird_mh"
val mhDatasetName = "tbird_mh"
val mhDestinationName = "/s/manhattan/cylon.native-thrift"
val mhTimeout: Duration = 5000.milliseconds
val localMhEndpoint: ManhattanKVEndpoint =
ManhattanKVEndpointBuilder(
ManhattanKVClient(
mhApplicationId,
mhDestinationName,
ManhattanKVClientMtlsParams(svcIdentifier)))
.defaultGuarantee(Guarantee.SoftDcReadMyWrites)
.defaultMaxTimeout(mhTimeout)
.build()
val readOperation: Read = (new ManhattanOperations(mhDatasetName, localMhEndpoint)).read
def lookup(tweetId: Long): Future[String] = {
val result = readOperation(tweetId).liftToTry.map {
case Return(mhRecords) =>
prettyPrintManhattanRecords(tweetId, TweetKey.padTweetIdStr(tweetId), mhRecords)
case Throw(e) => e.toString
}
Stitch.run(result)
}
def storedTweet(tweetId: Long): Future[StoredTweet] = {
val result = readOperation(tweetId).liftToTry.map {
case Return(mhRecords) =>
buildStoredTweet(tweetId, mhRecords)
case Throw(e) =>
throw e
}
Stitch.run(result)
}
private[this] def prettyPrintManhattanRecords(
tweetId: Long,
pkey: String,
mhRecords: Seq[TweetManhattanRecord]
): String = {
if (mhRecords.isEmpty) {
"Not Found"
} else {
val formattedRecords = getFormattedManhattanRecords(tweetId, mhRecords)
val keyFieldWidth = formattedRecords.map(_.key.length).max + 2
val fieldNameFieldWidth = formattedRecords.map(_.fieldName.length).max + 2
val formatString = s" %-${keyFieldWidth}s %-${fieldNameFieldWidth}s %s"
val recordsString =
formattedRecords
.map { record =>
val content = record.content.replaceAll("\n", "\n" + formatString.format("", "", ""))
formatString.format(record.key, record.fieldName, content)
}
.mkString("\n")
"/tbird_mh/" + pkey + "/" + "\n" + recordsString
}
}
private[this] def getFormattedManhattanRecords(
tweetId: Long,
mhRecords: Seq[TweetManhattanRecord]
): Seq[FormattedManhattanRecord] = {
val storedTweet = buildStoredTweet(tweetId, mhRecords).copy(updatedAt = None)
val tweetypieTweet: Option[TweetypieTweet] =
Try(StorageConversions.fromStoredTweet(storedTweet)).toOption
val blobMap: Map[String, TFieldBlob] = getStoredTweetBlobs(mhRecords).map { blob =>
getFieldName(blob.field.id) -> blob
}.toMap
mhRecords
.map {
case TweetManhattanRecord(fullKey, mhValue) =>
FormattedManhattanRecord(
key = fullKey.lKey.toString,
fieldName = getFieldName(fullKey.lKey),
content = prettyPrintManhattanValue(
fullKey.lKey,
mhValue,
storedTweet,
tweetypieTweet,
tweetId,
blobMap
)
)
}
.sortBy(_.key.replace("external", "xternal")) // sort by key, with internal first
}
private[this] def getFieldNameFromThrift(
fieldId: Short,
fieldInfos: List[ThriftStructFieldInfo]
): String =
fieldInfos
.find(info => info.tfield.id == fieldId)
.map(_.tfield.name)
.getOrElse("<UNKNOWN FIELD>")
private[this] def isLkeyScrubbedField(lkey: String): Boolean =
lkey.split("/")(1) == "scrubbed_fields"
private[this] def getFieldName(lkey: TweetKey.LKey): String =
lkey match {
case fieldKey: TweetKey.LKey.FieldKey => getFieldName(fieldKey.fieldId)
case _ => ""
}
private[this] def getFieldName(fieldId: Short): String =
if (fieldId == 1) {
"core_fields"
} else if (AdditionalFields.isAdditionalFieldId(fieldId)) {
getFieldNameFromThrift(fieldId, TweetypieTweet.fieldInfos)
} else {
getFieldNameFromThrift(fieldId, StoredTweet.fieldInfos)
}
private[this] def prettyPrintManhattanValue(
lkey: TweetKey.LKey,
mhValue: TweetManhattanValue,
storedTweet: StoredTweet,
tweetypieTweet: Option[TweetypieTweet],
tweetId: Long,
tfieldBlobs: Map[String, TFieldBlob]
): String = {
val decoded = lkey match {
case _: TweetKey.LKey.MetadataKey =>
decodeMetadata(mhValue)
case fieldKey: TweetKey.LKey.FieldKey =>
tfieldBlobs
.get(getFieldName(fieldKey.fieldId))
.map(blob => decodeField(tweetId, blob, storedTweet, tweetypieTweet))
case _ =>
None
}
decoded.getOrElse { // If all else fails, encode the data as a base64 string
val contents = mhValue.contents.array
if (contents.isEmpty) {
"<NO DATA>"
} else {
Base64.encodeBase64String(contents)
}
}
}
private[this] def decodeMetadata(mhValue: TweetManhattanValue): Option[String] = {
val byteArray = ByteArrayCodec.fromByteBuffer(mhValue.contents)
Try(Json.decode(byteArray).toString).toOption
}
private[this] def decodeField(
tweetId: Long,
blob: TFieldBlob,
storedTweet: StoredTweet,
tweetypieTweet: Option[TweetypieTweet]
): String = {
val fieldId = blob.field.id
if (fieldId == 1) {
coreFields(storedTweet)
} else if (AdditionalFields.isAdditionalFieldId(fieldId)) {
decodeTweetWithOneField(TweetypieTweet(tweetId).setField(blob))
} else {
decodeTweetWithOneField(StoredTweet(tweetId).setField(blob))
}
}
// Takes a Tweet or StoredTweet with a single field set and returns the value of that field
private[this] def decodeTweetWithOneField[T](
tweetWithOneField: T
)(
implicit ev: Cached[Strict[DiffShow[T]]]
): String = {
val config = diffshow.Config(hideFieldWithEmptyVal = true)
val tree: Expr = config.transform(DiffShow.show(tweetWithOneField))
// matches a Tweet or StoredTweet with two values, the first being the id
val value = tree.transform {
case Container(_, List(diffshow.Field("id", _), diffshow.Field(_, value))) => value
}
config.exprPrinter.apply(value, width = 80).render
}
private[this] def coreFields(storedTweet: StoredTweet): String =
diffshow.show(CoreFieldsCodec.fromTweet(storedTweet), hideFieldWithEmptyVal = true)
private[this] def toCamelCase(s: String): String =
CaseFormat.LOWER_UNDERSCORE.to(CaseFormat.LOWER_CAMEL, s)
}
case class FormattedManhattanRecord(key: String, fieldName: String, content: String)

View file

@ -0,0 +1,17 @@
package com.twitter.tweetypie.storage
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
object Json {
val TimestampKey = "timestamp"
val SoftDeleteTimestampKey = "softdelete_timestamp"
private val mapper = new ObjectMapper
mapper.registerModule(DefaultScalaModule)
def encode(m: Map[String, Any]): Array[Byte] = mapper.writeValueAsBytes(m)
def decode(arr: Array[Byte]): Map[String, Any] =
mapper.readValue[Map[String, Any]](arr, classOf[Map[String, Any]])
}

View file

@ -0,0 +1,103 @@
package com.twitter.tweetypie.storage
import com.twitter.bijection.Injection
import com.twitter.io.Buf
import com.twitter.stitch.Stitch
import com.twitter.storage.client.manhattan.bijections.Bijections.BufInjection
import com.twitter.storage.client.manhattan.kv.ManhattanKVEndpoint
import com.twitter.storage.client.manhattan.kv.impl.DescriptorP1L1
import com.twitter.storage.client.manhattan.kv.impl.Component
import com.twitter.storage.client.manhattan.kv.{impl => mh}
import com.twitter.storage.client.manhattan.bijections.Bijections.StringInjection
import com.twitter.util.Time
import java.nio.ByteBuffer
import scala.util.control.NonFatal
case class TweetManhattanRecord(key: TweetKey, value: TweetManhattanValue) {
def pkey: TweetId = key.tweetId
def lkey: TweetKey.LKey = key.lKey
/**
* Produces a representation that is human-readable, but contains
* all of the information from the record. It is not intended for
* producing machine-readable values.
*
* This conversion is relatively expensive, so beware of using it in
* hot code paths.
*/
override def toString: String = {
val valueString =
try {
key.lKey match {
case _: TweetKey.LKey.MetadataKey =>
StringCodec.fromByteBuffer(value.contents)
case _: TweetKey.LKey.FieldKey =>
val tFieldBlob = TFieldBlobCodec.fromByteBuffer(value.contents)
s"TFieldBlob(${tFieldBlob.field}, 0x${Buf.slowHexString(tFieldBlob.content)})"
case TweetKey.LKey.Unknown(_) =>
"0x" + Buf.slowHexString(Buf.ByteBuffer.Shared(value.contents))
}
} catch {
case NonFatal(e) =>
val hexValue = Buf.slowHexString(Buf.ByteBuffer.Shared(value.contents))
s"0x$hexValue (failed to decode due to $e)"
}
s"$key => ${value.copy(contents = valueString)}"
}
}
object ManhattanOperations {
type Read = TweetId => Stitch[Seq[TweetManhattanRecord]]
type Insert = TweetManhattanRecord => Stitch[Unit]
type Delete = (TweetKey, Option[Time]) => Stitch[Unit]
type DeleteRange = TweetId => Stitch[Unit]
object PkeyInjection extends Injection[TweetId, String] {
override def apply(tweetId: TweetId): String = TweetKey.padTweetIdStr(tweetId)
override def invert(str: String): scala.util.Try[TweetId] = scala.util.Try(str.toLong)
}
case class InvalidLkey(lkeyStr: String) extends Exception
object LkeyInjection extends Injection[TweetKey.LKey, String] {
override def apply(lkey: TweetKey.LKey): String = lkey.toString
override def invert(str: String): scala.util.Try[TweetKey.LKey] =
scala.util.Success(TweetKey.LKey.fromString(str))
}
val KeyDescriptor: DescriptorP1L1.EmptyKey[TweetId, TweetKey.LKey] =
mh.KeyDescriptor(
Component(PkeyInjection.andThen(StringInjection)),
Component(LkeyInjection.andThen(StringInjection))
)
val ValueDescriptor: mh.ValueDescriptor.EmptyValue[ByteBuffer] = mh.ValueDescriptor(BufInjection)
}
class ManhattanOperations(dataset: String, mhEndpoint: ManhattanKVEndpoint) {
import ManhattanOperations._
private[this] def pkey(tweetId: TweetId) = KeyDescriptor.withDataset(dataset).withPkey(tweetId)
def read: Read = { tweetId =>
mhEndpoint.slice(pkey(tweetId).under(), ValueDescriptor).map { mhData =>
mhData.map {
case (key, value) => TweetManhattanRecord(TweetKey(key.pkey, key.lkey), value)
}
}
}
def insert: Insert =
record => {
val mhKey = pkey(record.key.tweetId).withLkey(record.key.lKey)
mhEndpoint.insert(mhKey, ValueDescriptor.withValue(record.value))
}
def delete: Delete = (key, time) => mhEndpoint.delete(pkey(key.tweetId).withLkey(key.lKey), time)
def deleteRange: DeleteRange =
tweetId => mhEndpoint.deleteRange(KeyDescriptor.withDataset(dataset).withPkey(tweetId).under())
}

View file

@ -0,0 +1,451 @@
package com.twitter.tweetypie.storage
import com.twitter.conversions.DurationOps._
import com.twitter.finagle.mtls.authentication.EmptyServiceIdentifier
import com.twitter.finagle.mtls.authentication.ServiceIdentifier
import com.twitter.finagle.ssl.OpportunisticTls
import com.twitter.finagle.stats.NullStatsReceiver
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.logging.BareFormatter
import com.twitter.logging.Level
import com.twitter.logging.ScribeHandler
import com.twitter.logging._
import com.twitter.stitch.Stitch
import com.twitter.storage.client.manhattan.bijections.Bijections._
import com.twitter.storage.client.manhattan.kv._
import com.twitter.storage.client.manhattan.kv.impl.ValueDescriptor
import com.twitter.tweetypie.client_id.ClientIdHelper
import com.twitter.tweetypie.storage.Scribe.ScribeHandlerFactory
import com.twitter.tweetypie.storage.TweetStorageClient.BounceDelete
import com.twitter.tweetypie.storage.TweetStorageClient.GetTweet
import com.twitter.tweetypie.storage.TweetStorageClient.HardDeleteTweet
import com.twitter.tweetypie.thriftscala.Tweet
import com.twitter.tweetypie.util.StitchUtils
import com.twitter.util.Duration
import com.twitter.util.Return
import com.twitter.util.Throw
import scala.util.Random
object ManhattanTweetStorageClient {
object Config {
/**
* The Manhattan dataset where tweets are stored is not externally
* configurable because writing tweets to a non-production dataset
* requires great care. Staging instances using a different dataset will
* write tweets to a non-production store, but will publish events, log to
* HDFS, and cache data referencing tweets in that store which are not
* accessible by the rest of the production cluster.
*
* In a completely isolated environment it should be safe to write to
* other datasets for testing purposes.
*/
val Dataset = "tbird_mh"
/**
* Once a tweet has been deleted it can only be undeleted within this time
* window, after which [[UndeleteHandler]] will return an error on
* undelete attempts.
*/
val UndeleteWindowHours = 240
/**
* Default label used for underlying Manhattan Thrift client metrics
*
* The finagle client metrics will be exported at clnt/:label.
*/
val ThriftClientLabel = "mh_cylon"
/**
* Return the corresponding Wily path for the Cylon cluster in the "other" DC
*/
def remoteDestination(zone: String): String =
s"/srv#/prod/${remoteZone(zone)}/manhattan/cylon.native-thrift"
private def remoteZone(zone: String) = zone match {
case "pdxa" => "atla"
case "atla" | "localhost" => "pdxa"
case _ =>
throw new IllegalArgumentException(s"Cannot configure remote DC for unknown zone '$zone'")
}
}
/**
* @param applicationId Manhattan application id used for quota accounting
* @param localDestination Wily path to local Manhattan cluster
* @param localTimeout Overall timeout (including retries) for all reads/writes to local cluster
* @param remoteDestination Wily path to remote Manhattan cluster, used for undelete and force add
* @param remoteTimeout Overall timeout (including retries) for all reads/writes to remote cluster
* @param undeleteWindowHours Amount of time during which a deleted tweet can be undeleted
* @param thriftClientLabel Label used to scope stats for Manhattan Thrift client
* @param maxRequestsPerBatch Configure the Stitch RequestGroup.Generator batch size
* @param serviceIdentifier The ServiceIdentifier to use when making connections to a Manhattan cluster
* @param opportunisticTlsLevel The level to use for opportunistic TLS for connections to the Manhattan cluster
*/
case class Config(
applicationId: String,
localDestination: String,
localTimeout: Duration,
remoteDestination: String,
remoteTimeout: Duration,
undeleteWindowHours: Int = Config.UndeleteWindowHours,
thriftClientLabel: String = Config.ThriftClientLabel,
maxRequestsPerBatch: Int = Int.MaxValue,
serviceIdentifier: ServiceIdentifier,
opportunisticTlsLevel: OpportunisticTls.Level)
/**
* Sanitizes the input for APIs which take in a (Tweet, Seq[Field]) as input.
*
* NOTE: This function only applies sanity checks which are common to
* all APIs which take in a (Tweet, Seq[Field]) as input. API specific
* checks are not covered here.
*
* @param apiStitch the backing API call
* @tparam T the output type of the backing API call
* @return a stitch function which does some basic input sanity checking
*/
private[storage] def sanitizeTweetFields[T](
apiStitch: (Tweet, Seq[Field]) => Stitch[T]
): (Tweet, Seq[Field]) => Stitch[T] =
(tweet, fields) => {
require(fields.forall(_.id > 0), s"Field ids ${fields} are not positive numbers")
apiStitch(tweet, fields)
}
// Returns a handler that asynchronously logs messages to Scribe using the BareFormatter which
// logs just the message without any additional metadata
def scribeHandler(categoryName: String): HandlerFactory =
ScribeHandler(
formatter = BareFormatter,
maxMessagesPerTransaction = 100,
category = categoryName,
level = Some(Level.TRACE)
)
/**
* A Config appropriate for interactive sessions and scripts.
*/
def develConfig(): Config =
Config(
applicationId = Option(System.getenv("USER")).getOrElse("<unknown>") + ".devel",
localDestination = "/s/manhattan/cylon.native-thrift",
localTimeout = 10.seconds,
remoteDestination = "/s/manhattan/cylon.native-thrift",
remoteTimeout = 10.seconds,
undeleteWindowHours = Config.UndeleteWindowHours,
thriftClientLabel = Config.ThriftClientLabel,
maxRequestsPerBatch = Int.MaxValue,
serviceIdentifier = ServiceIdentifier(System.getenv("USER"), "tweetypie", "devel", "local"),
opportunisticTlsLevel = OpportunisticTls.Required
)
/**
* Build a Manhattan tweet storage client for use in interactive
* sessions and scripts.
*/
def devel(): TweetStorageClient =
new ManhattanTweetStorageClient(
develConfig(),
NullStatsReceiver,
ClientIdHelper.default,
)
}
class ManhattanTweetStorageClient(
config: ManhattanTweetStorageClient.Config,
statsReceiver: StatsReceiver,
private val clientIdHelper: ClientIdHelper)
extends TweetStorageClient {
import ManhattanTweetStorageClient._
lazy val scribeHandlerFactory: ScribeHandlerFactory = scribeHandler _
val scribe: Scribe = new Scribe(scribeHandlerFactory, statsReceiver)
def mkClient(
dest: String,
label: String
): ManhattanKVClient = {
val mhMtlsParams =
if (config.serviceIdentifier == EmptyServiceIdentifier) NoMtlsParams
else
ManhattanKVClientMtlsParams(
serviceIdentifier = config.serviceIdentifier,
opportunisticTls = config.opportunisticTlsLevel
)
new ManhattanKVClient(
config.applicationId,
dest,
mhMtlsParams,
label,
Seq(Experiments.ApertureLoadBalancer))
}
val localClient: ManhattanKVClient = mkClient(config.localDestination, config.thriftClientLabel)
val localMhEndpoint: ManhattanKVEndpoint = ManhattanKVEndpointBuilder(localClient)
.defaultGuarantee(Guarantee.SoftDcReadMyWrites)
.defaultMaxTimeout(config.localTimeout)
.maxRequestsPerBatch(config.maxRequestsPerBatch)
.build()
val localManhattanOperations = new ManhattanOperations(Config.Dataset, localMhEndpoint)
val remoteClient: ManhattanKVClient =
mkClient(config.remoteDestination, s"${config.thriftClientLabel}_remote")
val remoteMhEndpoint: ManhattanKVEndpoint = ManhattanKVEndpointBuilder(remoteClient)
.defaultGuarantee(Guarantee.SoftDcReadMyWrites)
.defaultMaxTimeout(config.remoteTimeout)
.build()
val remoteManhattanOperations = new ManhattanOperations(Config.Dataset, remoteMhEndpoint)
/**
* Note: This translation is only useful for non-batch endpoints. Batch endpoints currently
* represent failure without propagating an exception
* (e.g. [[com.twitter.tweetypie.storage.Response.TweetResponseCode.Failure]]).
*/
private[this] def translateExceptions(
apiName: String,
statsReceiver: StatsReceiver
): PartialFunction[Throwable, Throwable] = {
case e: IllegalArgumentException => ClientError(e.getMessage, e)
case e: DeniedManhattanException => RateLimited(e.getMessage, e)
case e: VersionMismatchError =>
statsReceiver.scope(apiName).counter("mh_version_mismatches").incr()
e
case e: InternalError =>
TweetUtils.log.error(e, s"Error processing $apiName request: ${e.getMessage}")
e
}
/**
* Count requests per client id producing metrics of the form
* .../clients/:root_client_id/requests
*/
def observeClientId[A, B](
apiStitch: A => Stitch[B],
statsReceiver: StatsReceiver,
clientIdHelper: ClientIdHelper,
): A => Stitch[B] = {
val clients = statsReceiver.scope("clients")
val incrementClientRequests = { args: A =>
val clientId = clientIdHelper.effectiveClientIdRoot.getOrElse(ClientIdHelper.UnknownClientId)
clients.counter(clientId, "requests").incr
}
a => {
incrementClientRequests(a)
apiStitch(a)
}
}
/**
* Increment counters based on the overall response status of the returned [[GetTweet.Response]].
*/
def observeGetTweetResponseCode[A](
apiStitch: A => Stitch[GetTweet.Response],
statsReceiver: StatsReceiver
): A => Stitch[GetTweet.Response] = {
val scope = statsReceiver.scope("response_code")
val success = scope.counter("success")
val notFound = scope.counter("not_found")
val failure = scope.counter("failure")
val overCapacity = scope.counter("over_capacity")
val deleted = scope.counter("deleted")
val bounceDeleted = scope.counter("bounce_deleted")
a =>
apiStitch(a).respond {
case Return(_: GetTweet.Response.Found) => success.incr()
case Return(GetTweet.Response.NotFound) => notFound.incr()
case Return(_: GetTweet.Response.BounceDeleted) => bounceDeleted.incr()
case Return(GetTweet.Response.Deleted) => deleted.incr()
case Throw(_: RateLimited) => overCapacity.incr()
case Throw(_) => failure.incr()
}
}
/**
* We do 3 things here:
*
* - Bookkeeping for overall requests
* - Bookkeeping for per api requests
* - Translate exceptions
*
* @param apiName the API being called
* @param apiStitch the implementation of the API
* @tparam A template for input type of API
* @tparam B template for output type of API
* @return Function which executes the given API call
*/
private[storage] def endpoint[A, B](
apiName: String,
apiStitch: A => Stitch[B]
): A => Stitch[B] = {
val translateException = translateExceptions(apiName, statsReceiver)
val observe = StitchUtils.observe[B](statsReceiver, apiName)
a =>
StitchUtils.translateExceptions(
observe(apiStitch(a)),
translateException
)
}
private[storage] def endpoint2[A, B, C](
apiName: String,
apiStitch: (A, B) => Stitch[C],
clientIdHelper: ClientIdHelper,
): (A, B) => Stitch[C] =
Function.untupled(endpoint(apiName, apiStitch.tupled))
val getTweet: TweetStorageClient.GetTweet = {
val stats = statsReceiver.scope("getTweet")
observeClientId(
observeGetTweetResponseCode(
endpoint(
"getTweet",
GetTweetHandler(
read = localManhattanOperations.read,
statsReceiver = stats,
)
),
stats,
),
stats,
clientIdHelper,
)
}
val getStoredTweet: TweetStorageClient.GetStoredTweet = {
val stats = statsReceiver.scope("getStoredTweet")
observeClientId(
endpoint(
"getStoredTweet",
GetStoredTweetHandler(
read = localManhattanOperations.read,
statsReceiver = stats,
)
),
stats,
clientIdHelper,
)
}
val addTweet: TweetStorageClient.AddTweet =
endpoint(
"addTweet",
AddTweetHandler(
insert = localManhattanOperations.insert,
scribe = scribe,
stats = statsReceiver
)
)
val updateTweet: TweetStorageClient.UpdateTweet =
endpoint2(
"updateTweet",
ManhattanTweetStorageClient.sanitizeTweetFields(
UpdateTweetHandler(
insert = localManhattanOperations.insert,
stats = statsReceiver,
)
),
clientIdHelper,
)
val softDelete: TweetStorageClient.SoftDelete =
endpoint(
"softDelete",
SoftDeleteHandler(
insert = localManhattanOperations.insert,
scribe = scribe
)
)
val bounceDelete: BounceDelete =
endpoint(
"bounceDelete",
BounceDeleteHandler(
insert = localManhattanOperations.insert,
scribe = scribe
)
)
val undelete: TweetStorageClient.Undelete =
endpoint(
"undelete",
UndeleteHandler(
read = localManhattanOperations.read,
localInsert = localManhattanOperations.insert,
remoteInsert = remoteManhattanOperations.insert,
delete = localManhattanOperations.delete,
undeleteWindowHours = config.undeleteWindowHours,
stats = statsReceiver
)
)
val getDeletedTweets: TweetStorageClient.GetDeletedTweets =
endpoint(
"getDeletedTweets",
GetDeletedTweetsHandler(
read = localManhattanOperations.read,
stats = statsReceiver
)
)
val deleteAdditionalFields: TweetStorageClient.DeleteAdditionalFields =
endpoint2(
"deleteAdditionalFields",
DeleteAdditionalFieldsHandler(
delete = localManhattanOperations.delete,
stats = statsReceiver,
),
clientIdHelper,
)
val scrub: TweetStorageClient.Scrub =
endpoint2(
"scrub",
ScrubHandler(
insert = localManhattanOperations.insert,
delete = localManhattanOperations.delete,
scribe = scribe,
stats = statsReceiver,
),
clientIdHelper,
)
val hardDeleteTweet: HardDeleteTweet =
endpoint(
"hardDeleteTweet",
HardDeleteTweetHandler(
read = localManhattanOperations.read,
insert = localManhattanOperations.insert,
delete = localManhattanOperations.delete,
scribe = scribe,
stats = statsReceiver
)
)
val ping: TweetStorageClient.Ping =
() =>
Stitch
.run(
localMhEndpoint
.get(
ManhattanOperations.KeyDescriptor
.withDataset(Config.Dataset)
.withPkey(Random.nextLong().abs)
.withLkey(TweetKey.LKey.CoreFieldsKey), // could be any lkey
ValueDescriptor(BufInjection)
).unit
)
}

View file

@ -0,0 +1,30 @@
package com.twitter.tweetypie.storage
object Response {
case class TweetResponse(
tweetId: Long,
overallResponse: TweetResponseCode,
additionalFieldResponses: Option[Map[Short, FieldResponse]] = None)
sealed trait TweetResponseCode
object TweetResponseCode {
object Success extends TweetResponseCode
object Partial extends TweetResponseCode
object Failure extends TweetResponseCode
object OverCapacity extends TweetResponseCode
object Deleted extends TweetResponseCode
}
case class FieldResponse(code: FieldResponseCode, message: Option[String] = None)
sealed trait FieldResponseCode
object FieldResponseCode {
object Success extends FieldResponseCode
object InvalidRequest extends FieldResponseCode
object ValueNotFound extends FieldResponseCode
object Timeout extends FieldResponseCode
object Error extends FieldResponseCode
}
}

View file

@ -0,0 +1,85 @@
package com.twitter.tweetypie.storage
import com.twitter.servo.util.FutureEffect
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.logging._
import com.twitter.scrooge.BinaryThriftStructSerializer
import com.twitter.servo.util.{Scribe => ServoScribe}
import com.twitter.tweetypie.storage_internal.thriftscala._
import com.twitter.tbird.thriftscala.Added
import com.twitter.tbird.thriftscala.Removed
import com.twitter.tbird.thriftscala.Scrubbed
import com.twitter.util.Time
/**
* Scribe is used to log tweet writes which are used to generate /tables/statuses in HDFS.
*
* Write Scribe Category Message
* ----- --------------- -------
* add tbird_add_status [[com.twitter.tbird.thriftscala.Added]]
* remove tbird_remove_status [[com.twitter.tbird.thriftscala.Removed]]
* scrub tbird_scrub_status [[com.twitter.tbird.thriftscala.Scrubbed]]
*
* The thrift representation is encoded using binary thrift protocol format, followed by base64
* encoding and converted to string using default character set (utf8). The logger uses BareFormatter.
*
* The thrift ops are scribed only after the write API call has succeeded.
*
* The class is thread safe except initial configuration and registration routines,
* and no exception is expected unless java heap is out of memory.
*
* If exception does get thrown, add/remove/scrub operations will fail and
* client will have to retry
*/
class Scribe(factory: Scribe.ScribeHandlerFactory, statsReceiver: StatsReceiver) {
import Scribe._
private val AddedSerializer = BinaryThriftStructSerializer(Added)
private val RemovedSerializer = BinaryThriftStructSerializer(Removed)
private val ScrubbedSerializer = BinaryThriftStructSerializer(Scrubbed)
private val addCounter = statsReceiver.counter("scribe/add/count")
private val removeCounter = statsReceiver.counter("scribe/remove/count")
private val scrubCounter = statsReceiver.counter("scribe/scrub/count")
val addHandler: FutureEffect[String] = ServoScribe(factory(scribeAddedCategory)())
val removeHandler: FutureEffect[String] = ServoScribe(factory(scribeRemovedCategory)())
val scrubHandler: FutureEffect[String] = ServoScribe(factory(scribeScrubbedCategory)())
private def addedToString(tweet: StoredTweet): String =
AddedSerializer.toString(
Added(StatusConversions.toTBirdStatus(tweet), Time.now.inMilliseconds, Some(false))
)
private def removedToString(id: Long, at: Time, isSoftDeleted: Boolean): String =
RemovedSerializer.toString(Removed(id, at.inMilliseconds, Some(isSoftDeleted)))
private def scrubbedToString(id: Long, cols: Seq[Int], at: Time): String =
ScrubbedSerializer.toString(Scrubbed(id, cols, at.inMilliseconds))
def logAdded(tweet: StoredTweet): Unit = {
addHandler(addedToString(tweet))
addCounter.incr()
}
def logRemoved(id: Long, at: Time, isSoftDeleted: Boolean): Unit = {
removeHandler(removedToString(id, at, isSoftDeleted))
removeCounter.incr()
}
def logScrubbed(id: Long, cols: Seq[Int], at: Time): Unit = {
scrubHandler(scrubbedToString(id, cols, at))
scrubCounter.incr()
}
}
object Scribe {
type ScribeHandlerFactory = (String) => HandlerFactory
/** WARNING: These categories are white-listed. If you are changing them, the new categories should be white-listed.
* You should followup with CoreWorkflows team (CW) for that.
*/
private val scribeAddedCategory = "tbird_add_status"
private val scribeRemovedCategory = "tbird_remove_status"
private val scribeScrubbedCategory = "tbird_scrub_status"
}

View file

@ -0,0 +1,71 @@
package com.twitter.tweetypie.storage
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.stitch.Stitch
import com.twitter.storage.client.manhattan.kv.ManhattanValue
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.util.Time
/**
* Deletes data for the scrubbed field and writes a metadata record.
* Provides scrub functionality. Right now, we only allow the scrubbing of the geo field.
* It should be simple to add more fields to the allowlist if needed.
*/
object ScrubHandler {
val scrubFieldsAllowlist: Set[Field] = Set(Field.Geo)
def apply(
insert: ManhattanOperations.Insert,
delete: ManhattanOperations.Delete,
scribe: Scribe,
stats: StatsReceiver
): TweetStorageClient.Scrub =
(unfilteredTweetIds: Seq[TweetId], columns: Seq[Field]) => {
val tweetIds = unfilteredTweetIds.filter(_ > 0)
require(columns.nonEmpty, "Must specify fields to scrub")
require(
columns.toSet.size == columns.size,
s"Duplicate fields to scrub specified: $columns"
)
require(
columns.forall(scrubFieldsAllowlist.contains(_)),
s"Cannot scrub $columns; scrubbable fields are restricted to $scrubFieldsAllowlist"
)
Stats.addWidthStat("scrub", "ids", tweetIds.size, stats)
val mhTimestamp = Time.now
val stitches = tweetIds.map { tweetId =>
val deletionStitches = columns.map { field =>
val mhKeyToDelete = TweetKey.fieldKey(tweetId, field.id)
delete(mhKeyToDelete, Some(mhTimestamp)).liftToTry
}
val collectedStitch =
Stitch.collect(deletionStitches).map(collectWithRateLimitCheck).lowerFromTry
collectedStitch
.flatMap { _ =>
val scrubbedStitches = columns.map { column =>
val scrubbedKey = TweetKey.scrubbedFieldKey(tweetId, column.id)
val record =
TweetManhattanRecord(
scrubbedKey,
ManhattanValue(StringCodec.toByteBuffer(""), Some(mhTimestamp))
)
insert(record).liftToTry
}
Stitch.collect(scrubbedStitches)
}
.map(collectWithRateLimitCheck)
}
Stitch.collect(stitches).map(collectWithRateLimitCheck).lowerFromTry.onSuccess { _ =>
tweetIds.foreach { id => scribe.logScrubbed(id, columns.map(_.id.toInt), mhTimestamp) }
}
}
}

View file

@ -0,0 +1,20 @@
package com.twitter.tweetypie.storage
import com.twitter.util.Time
object SoftDeleteHandler {
def apply(
insert: ManhattanOperations.Insert,
scribe: Scribe
): TweetStorageClient.SoftDelete =
tweetId => {
val mhTimestamp = Time.now
val softDeleteRecord = TweetStateRecord
.SoftDeleted(tweetId, mhTimestamp.inMillis)
.toTweetMhRecord
insert(softDeleteRecord).onSuccess { _ =>
scribe.logRemoved(tweetId, mhTimestamp, isSoftDeleted = true)
}
}
}

View file

@ -0,0 +1,33 @@
package com.twitter.tweetypie.storage
import com.twitter.finagle.stats.StatsReceiver
object Stats {
// These two methods below (addWidthStat and updatePerFieldQpsCounters) are called per RPC call for most APIs,
// so we rely on the stats receiver that is passed in to the library to do memoization.
private[storage] def addWidthStat(
rpcName: String,
paramName: String,
width: Int,
stats: StatsReceiver
): Unit =
getStat(rpcName, paramName, stats).add(width)
// Updates the counters for each Additional field. The idea here is to expose the QPS for each
// additional field
private[storage] def updatePerFieldQpsCounters(
rpcName: String,
fieldIds: Seq[FieldId],
count: Int,
stats: StatsReceiver
): Unit = {
fieldIds.foreach { fieldId => getCounter(rpcName, fieldId, stats).incr(count) }
}
private def getCounter(rpcName: String, fieldId: FieldId, stats: StatsReceiver) =
stats.scope(rpcName, "fields", fieldId.toString).counter("count")
private def getStat(rpcName: String, paramName: String, stats: StatsReceiver) =
stats.scope(rpcName, paramName).stat("width")
}

View file

@ -0,0 +1,129 @@
package com.twitter.tweetypie.storage
import com.twitter.tweetypie.storage_internal.thriftscala._
import com.twitter.tbird.{thriftscala => tbird}
object StatusConversions {
/**
* This is used only in Scribe.scala, when scribing to tbird_add_status
* Once we remove that, we can also remove this.
*/
def toTBirdStatus(tweet: StoredTweet): tbird.Status =
tbird.Status(
id = tweet.id,
userId = tweet.userId.get,
text = tweet.text.get,
createdVia = tweet.createdVia.get,
createdAtSec = tweet.createdAtSec.get,
reply = tweet.reply.map(toTBirdReply),
share = tweet.share.map(toTBirdShare),
contributorId = tweet.contributorId,
geo = tweet.geo.map(toTBirdGeo),
hasTakedown = tweet.hasTakedown.getOrElse(false),
nsfwUser = tweet.nsfwUser.getOrElse(false),
nsfwAdmin = tweet.nsfwAdmin.getOrElse(false),
media = tweet.media.map(_.map(toTBirdMedia)).getOrElse(Seq()),
narrowcast = tweet.narrowcast.map(toTBirdNarrowcast),
nullcast = tweet.nullcast.getOrElse(false),
trackingId = tweet.trackingId
)
/**
* This is only used in a test, to verify that the above method `toTBirdStatus`
* works, so we can't remove it as long as the above method exists.
*/
def fromTBirdStatus(status: tbird.Status): StoredTweet = {
StoredTweet(
id = status.id,
userId = Some(status.userId),
text = Some(status.text),
createdVia = Some(status.createdVia),
createdAtSec = Some(status.createdAtSec),
reply = status.reply.map(fromTBirdReply),
share = status.share.map(fromTBirdShare),
contributorId = status.contributorId,
geo = status.geo.map(fromTBirdGeo),
hasTakedown = Some(status.hasTakedown),
nsfwUser = Some(status.nsfwUser),
nsfwAdmin = Some(status.nsfwAdmin),
media = Some(status.media.map(fromTBirdMedia)),
narrowcast = status.narrowcast.map(fromTBirdNarrowcast),
nullcast = Some(status.nullcast),
trackingId = status.trackingId
)
}
private def fromTBirdReply(reply: tbird.Reply): StoredReply =
StoredReply(
inReplyToStatusId = reply.inReplyToStatusId,
inReplyToUserId = reply.inReplyToUserId
)
private def fromTBirdShare(share: tbird.Share): StoredShare =
StoredShare(
sourceStatusId = share.sourceStatusId,
sourceUserId = share.sourceUserId,
parentStatusId = share.parentStatusId
)
private def fromTBirdGeo(geo: tbird.Geo): StoredGeo =
StoredGeo(
latitude = geo.latitude,
longitude = geo.longitude,
geoPrecision = geo.geoPrecision,
entityId = geo.entityId
)
private def fromTBirdMedia(media: tbird.MediaEntity): StoredMediaEntity =
StoredMediaEntity(
id = media.id,
mediaType = media.mediaType,
width = media.width,
height = media.height
)
private def fromTBirdNarrowcast(narrowcast: tbird.Narrowcast): StoredNarrowcast =
StoredNarrowcast(
language = Some(narrowcast.language),
location = Some(narrowcast.location),
ids = Some(narrowcast.ids)
)
private def toTBirdReply(reply: StoredReply): tbird.Reply =
tbird.Reply(
inReplyToStatusId = reply.inReplyToStatusId,
inReplyToUserId = reply.inReplyToUserId
)
private def toTBirdShare(share: StoredShare): tbird.Share =
tbird.Share(
sourceStatusId = share.sourceStatusId,
sourceUserId = share.sourceUserId,
parentStatusId = share.parentStatusId
)
private def toTBirdGeo(geo: StoredGeo): tbird.Geo =
tbird.Geo(
latitude = geo.latitude,
longitude = geo.longitude,
geoPrecision = geo.geoPrecision,
entityId = geo.entityId,
name = geo.name
)
private def toTBirdMedia(media: StoredMediaEntity): tbird.MediaEntity =
tbird.MediaEntity(
id = media.id,
mediaType = media.mediaType,
width = media.width,
height = media.height
)
private def toTBirdNarrowcast(narrowcast: StoredNarrowcast): tbird.Narrowcast =
tbird.Narrowcast(
language = narrowcast.language.getOrElse(Nil),
location = narrowcast.location.getOrElse(Nil),
ids = narrowcast.ids.getOrElse(Nil)
)
}

View file

@ -0,0 +1,346 @@
package com.twitter.tweetypie.storage
import com.twitter.mediaservices.commons.tweetmedia.thriftscala._
import com.twitter.scrooge.TFieldBlob
import com.twitter.tweetypie.additionalfields.AdditionalFields
import com.twitter.tweetypie.storage_internal.thriftscala._
import com.twitter.tweetypie.thriftscala._
import com.twitter.tweetypie.util.TweetLenses
object StorageConversions {
private val tbTweetCompiledAdditionalFieldIds =
StoredTweet.metaData.fields.map(_.id).filter(AdditionalFields.isAdditionalFieldId)
def toStoredReply(reply: Reply, conversationId: Option[TweetId]): StoredReply =
StoredReply(
inReplyToStatusId = reply.inReplyToStatusId.getOrElse(0),
inReplyToUserId = reply.inReplyToUserId,
conversationId = conversationId
)
def toStoredShare(share: Share): StoredShare =
StoredShare(
share.sourceStatusId,
share.sourceUserId,
share.parentStatusId
)
def toStoredQuotedTweet(qt: QuotedTweet, text: String): Option[StoredQuotedTweet] =
qt.permalink
.filterNot { p =>
text.contains(p.shortUrl)
} // omit StoredQuotedTweet when url already in text
.map { p =>
StoredQuotedTweet(
qt.tweetId,
qt.userId,
p.shortUrl
)
}
def toStoredGeo(tweet: Tweet): Option[StoredGeo] =
TweetLenses.geoCoordinates.get(tweet) match {
case None =>
TweetLenses.placeId.get(tweet) match {
case None => None
case Some(placeId) =>
Some(
StoredGeo(
latitude = 0.0,
longitude = 0.0,
geoPrecision = 0,
entityId = 0,
name = Some(placeId)
)
)
}
case Some(coords) =>
Some(
StoredGeo(
latitude = coords.latitude,
longitude = coords.longitude,
geoPrecision = coords.geoPrecision,
entityId = if (coords.display) 2 else 0,
name = TweetLenses.placeId.get(tweet)
)
)
}
def toStoredMedia(mediaList: Seq[MediaEntity]): Seq[StoredMediaEntity] =
mediaList.filter(_.sourceStatusId.isEmpty).flatMap(toStoredMediaEntity)
def toStoredMediaEntity(media: MediaEntity): Option[StoredMediaEntity] =
media.sizes.find(_.sizeType == MediaSizeType.Orig).map { origSize =>
StoredMediaEntity(
id = media.mediaId,
mediaType = origSize.deprecatedContentType.value.toByte,
width = origSize.width.toShort,
height = origSize.height.toShort
)
}
// The language and ids fields are for compatibility with existing tweets stored in manhattan.
def toStoredNarrowcast(narrowcast: Narrowcast): StoredNarrowcast =
StoredNarrowcast(
language = Some(Seq.empty),
location = Some(narrowcast.location),
ids = Some(Seq.empty)
)
def toStoredAdditionalFields(from: Seq[TFieldBlob], to: StoredTweet): StoredTweet =
from.foldLeft(to) { case (t, f) => t.setField(f) }
def toStoredAdditionalFields(from: Tweet, to: StoredTweet): StoredTweet =
toStoredAdditionalFields(AdditionalFields.additionalFields(from), to)
def toStoredTweet(tweet: Tweet): StoredTweet = {
val storedTweet =
StoredTweet(
id = tweet.id,
userId = Some(TweetLenses.userId(tweet)),
text = Some(TweetLenses.text(tweet)),
createdVia = Some(TweetLenses.createdVia(tweet)),
createdAtSec = Some(TweetLenses.createdAt(tweet)),
reply =
TweetLenses.reply(tweet).map { r => toStoredReply(r, TweetLenses.conversationId(tweet)) },
share = TweetLenses.share(tweet).map(toStoredShare),
contributorId = tweet.contributor.map(_.userId),
geo = toStoredGeo(tweet),
hasTakedown = Some(TweetLenses.hasTakedown(tweet)),
nsfwUser = Some(TweetLenses.nsfwUser(tweet)),
nsfwAdmin = Some(TweetLenses.nsfwAdmin(tweet)),
media = tweet.media.map(toStoredMedia),
narrowcast = TweetLenses.narrowcast(tweet).map(toStoredNarrowcast),
nullcast = Some(TweetLenses.nullcast(tweet)),
trackingId = TweetLenses.trackingId(tweet),
quotedTweet = TweetLenses.quotedTweet(tweet).flatMap { qt =>
toStoredQuotedTweet(qt, TweetLenses.text(tweet))
}
)
toStoredAdditionalFields(tweet, storedTweet)
}
/**
* Does not need core data to be set. Constructs on disk tweet by avoiding the TweetLenses object
* and only extracting the specified fields.
*
* NOTE: Assumes that specified fields are set in the tweet.
*
* @param tpTweet Tweetypie Tweet to be converted
* @param fields the fields to be populated in the on disk Tweet
*
* @return an on disk Tweet which has only the specified fields set
*/
def toStoredTweetForFields(tpTweet: Tweet, fields: Set[Field]): StoredTweet = {
// Make sure all the passed in fields are known or additional fields
require(
(fields -- Field.AllUpdatableCompiledFields)
.forall(field => AdditionalFields.isAdditionalFieldId(field.id))
)
val storedTweet =
StoredTweet(
id = tpTweet.id,
geo = if (fields.contains(Field.Geo)) {
tpTweet.coreData.get.coordinates match {
case None =>
tpTweet.coreData.get.placeId match {
case None => None
case Some(placeId) =>
Some(
StoredGeo(
latitude = 0.0,
longitude = 0.0,
geoPrecision = 0,
entityId = 0,
name = Some(placeId)
)
)
}
case Some(coords) =>
Some(
StoredGeo(
latitude = coords.latitude,
longitude = coords.longitude,
geoPrecision = coords.geoPrecision,
entityId = if (coords.display) 2 else 0,
name = tpTweet.coreData.get.placeId
)
)
}
} else {
None
},
hasTakedown =
if (fields.contains(Field.HasTakedown))
Some(tpTweet.coreData.get.hasTakedown)
else
None,
nsfwUser =
if (fields.contains(Field.NsfwUser))
Some(tpTweet.coreData.get.nsfwUser)
else
None,
nsfwAdmin =
if (fields.contains(Field.NsfwAdmin))
Some(tpTweet.coreData.get.nsfwAdmin)
else
None
)
if (fields.map(_.id).exists(AdditionalFields.isAdditionalFieldId))
toStoredAdditionalFields(tpTweet, storedTweet)
else
storedTweet
}
def fromStoredReply(reply: StoredReply): Reply =
Reply(
Some(reply.inReplyToStatusId).filter(_ > 0),
reply.inReplyToUserId
)
def fromStoredShare(share: StoredShare): Share =
Share(
share.sourceStatusId,
share.sourceUserId,
share.parentStatusId
)
def fromStoredQuotedTweet(qt: StoredQuotedTweet): QuotedTweet =
QuotedTweet(
qt.tweetId,
qt.userId,
Some(
ShortenedUrl(
shortUrl = qt.shortUrl,
longUrl = "", // will be hydrated later via tweetypie's QuotedTweetRefUrlsHydrator
displayText = "" //will be hydrated later via tweetypie's QuotedTweetRefUrlsHydrator
)
)
)
def fromStoredGeo(geo: StoredGeo): GeoCoordinates =
GeoCoordinates(
latitude = geo.latitude,
longitude = geo.longitude,
geoPrecision = geo.geoPrecision,
display = geo.entityId == 2
)
def fromStoredMediaEntity(media: StoredMediaEntity): MediaEntity =
MediaEntity(
fromIndex = -1, // will get filled in later
toIndex = -1, // will get filled in later
url = null, // will get filled in later
mediaPath = "", // field is obsolete
mediaUrl = null, // will get filled in later
mediaUrlHttps = null, // will get filled in later
displayUrl = null, // will get filled in later
expandedUrl = null, // will get filled in later
mediaId = media.id,
nsfw = false,
sizes = Set(
MediaSize(
sizeType = MediaSizeType.Orig,
resizeMethod = MediaResizeMethod.Fit,
deprecatedContentType = MediaContentType(media.mediaType),
width = media.width,
height = media.height
)
)
)
def fromStoredNarrowcast(narrowcast: StoredNarrowcast): Narrowcast =
Narrowcast(
location = narrowcast.location.getOrElse(Seq())
)
def fromStoredTweet(storedTweet: StoredTweet): Tweet = {
val coreData =
TweetCoreData(
userId = storedTweet.userId.get,
text = storedTweet.text.get,
createdVia = storedTweet.createdVia.get,
createdAtSecs = storedTweet.createdAtSec.get,
reply = storedTweet.reply.map(fromStoredReply),
share = storedTweet.share.map(fromStoredShare),
hasTakedown = storedTweet.hasTakedown.getOrElse(false),
nsfwUser = storedTweet.nsfwUser.getOrElse(false),
nsfwAdmin = storedTweet.nsfwAdmin.getOrElse(false),
narrowcast = storedTweet.narrowcast.map(fromStoredNarrowcast),
nullcast = storedTweet.nullcast.getOrElse(false),
trackingId = storedTweet.trackingId,
conversationId = storedTweet.reply.flatMap(_.conversationId),
placeId = storedTweet.geo.flatMap(_.name),
coordinates = storedTweet.geo.map(fromStoredGeo),
hasMedia = if (storedTweet.media.exists(_.nonEmpty)) Some(true) else None
)
// retweets should never have their media, but some tweets incorrectly do.
val storedMedia = if (coreData.share.isDefined) Nil else storedTweet.media.toSeq
val tpTweet =
Tweet(
id = storedTweet.id,
coreData = Some(coreData),
contributor = storedTweet.contributorId.map(Contributor(_)),
media = Some(storedMedia.flatten.map(fromStoredMediaEntity)),
mentions = Some(Seq.empty),
urls = Some(Seq.empty),
cashtags = Some(Seq.empty),
hashtags = Some(Seq.empty),
quotedTweet = storedTweet.quotedTweet.map(fromStoredQuotedTweet)
)
fromStoredAdditionalFields(storedTweet, tpTweet)
}
def fromStoredTweetAllowInvalid(storedTweet: StoredTweet): Tweet = {
fromStoredTweet(
storedTweet.copy(
userId = storedTweet.userId.orElse(Some(-1L)),
text = storedTweet.text.orElse(Some("")),
createdVia = storedTweet.createdVia.orElse(Some("")),
createdAtSec = storedTweet.createdAtSec.orElse(Some(-1L))
))
}
def fromStoredAdditionalFields(from: StoredTweet, to: Tweet): Tweet = {
val passThroughAdditionalFields =
from._passthroughFields.filterKeys(AdditionalFields.isAdditionalFieldId)
val allAdditionalFields =
from.getFieldBlobs(tbTweetCompiledAdditionalFieldIds) ++ passThroughAdditionalFields
allAdditionalFields.values.foldLeft(to) { case (t, f) => t.setField(f) }
}
def toDeletedTweet(storedTweet: StoredTweet): DeletedTweet = {
val noteTweetBlob = storedTweet.getFieldBlob(Tweet.NoteTweetField.id)
val noteTweetOption = noteTweetBlob.map(blob => NoteTweet.decode(blob.read))
DeletedTweet(
id = storedTweet.id,
userId = storedTweet.userId,
text = storedTweet.text,
createdAtSecs = storedTweet.createdAtSec,
share = storedTweet.share.map(toDeletedShare),
media = storedTweet.media.map(_.map(toDeletedMediaEntity)),
noteTweetId = noteTweetOption.map(_.id),
isExpandable = noteTweetOption.flatMap(_.isExpandable)
)
}
def toDeletedShare(storedShare: StoredShare): DeletedTweetShare =
DeletedTweetShare(
sourceStatusId = storedShare.sourceStatusId,
sourceUserId = storedShare.sourceUserId,
parentStatusId = storedShare.parentStatusId
)
def toDeletedMediaEntity(storedMediaEntity: StoredMediaEntity): DeletedTweetMediaEntity =
DeletedTweetMediaEntity(
id = storedMediaEntity.id,
mediaType = storedMediaEntity.mediaType,
width = storedMediaEntity.width,
height = storedMediaEntity.height
)
}

View file

@ -0,0 +1,92 @@
package com.twitter.tweetypie.storage
import com.twitter.util.Return
import com.twitter.util.Throw
import com.twitter.util.Time
import com.twitter.util.Try
import java.util.Arrays
import scala.util.control.NoStackTrace
import scala.util.control.NonFatal
sealed abstract class TimestampType(val keyName: String)
object TimestampType {
object Default extends TimestampType("timestamp")
object SoftDelete extends TimestampType("softdelete_timestamp")
}
/**
* TimestampDecoder gets the timestamps associated with state records. The Manhattan timestamp is
* used for legacy records (with value "1"), otherwise the timestamp is extracted from the
* JSON value.
*
* See "Metadata" in README.md for further information about state records.
*/
object TimestampDecoder {
case class UnparsableJson(msg: String, t: Throwable) extends Exception(msg, t) with NoStackTrace
case class MissingJsonTimestamp(msg: String) extends Exception(msg) with NoStackTrace
case class UnexpectedJsonValue(msg: String) extends Exception(msg) with NoStackTrace
case class MissingManhattanTimestamp(msg: String) extends Exception(msg) with NoStackTrace
private[storage] val LegacyValue: Array[Byte] = Array('1')
/**
* The first backfill of tweet data to Manhattan supplied timestamps in milliseconds where
* nanoseconds were expected. The result is that some values have an incorrect Manhattan
* timestamp. For these bad timestamps, time.inNanoseconds is actually milliseconds.
*
* For example, the deletion record for tweet 22225781 has Manhattan timestamp 1970-01-01 00:23:24 +0000.
* Contrast with the deletion record for tweet 435404491999813632 with Manhattan timestamp 2014-11-09 14:24:04 +0000
*
* This threshold value comes from the last time in milliseconds that was interpreted
* as nanoseconds, e.g. Time.fromNanoseconds(1438387200000L) == 1970-01-01 00:23:58 +0000
*/
private[storage] val BadTimestampThreshold = Time.at("1970-01-01 00:23:58 +0000")
def decode(record: TweetManhattanRecord, tsType: TimestampType): Try[Long] =
decode(record.value, tsType)
def decode(mhValue: TweetManhattanValue, tsType: TimestampType): Try[Long] = {
val value = ByteArrayCodec.fromByteBuffer(mhValue.contents)
if (isLegacyRecord(value)) {
nativeManhattanTimestamp(mhValue)
} else {
jsonTimestamp(value, tsType)
}
}
private def isLegacyRecord(value: Array[Byte]) = Arrays.equals(value, LegacyValue)
private def nativeManhattanTimestamp(mhValue: TweetManhattanValue): Try[Long] =
mhValue.timestamp match {
case Some(ts) => Return(correctedTimestamp(ts))
case None =>
Throw(MissingManhattanTimestamp(s"Manhattan timestamp missing in value $mhValue"))
}
private def jsonTimestamp(value: Array[Byte], tsType: TimestampType): Try[Long] =
Try { Json.decode(value) }
.rescue { case NonFatal(e) => Throw(UnparsableJson(e.getMessage, e)) }
.flatMap { m =>
m.get(tsType.keyName) match {
case Some(v) =>
v match {
case l: Long => Return(l)
case i: Integer => Return(i.toLong)
case _ =>
Throw(
UnexpectedJsonValue(s"Unexpected value for ${tsType.keyName} in record data $m")
)
}
case None =>
Throw(MissingJsonTimestamp(s"Missing key ${tsType.keyName} in record data $m"))
}
}
def correctedTime(t: Time): Time =
if (t < BadTimestampThreshold) Time.fromMilliseconds(t.inNanoseconds) else t
def correctedTime(t: Long): Time = correctedTime(Time.fromNanoseconds(t))
def correctedTimestamp(t: Time): Long =
if (t < BadTimestampThreshold) t.inNanoseconds else t.inMilliseconds
}

View file

@ -0,0 +1,164 @@
package com.twitter.tweetypie.storage
/**
* Responsible for encoding/decoding Tweet records to/from Manhattan keys
*
* K/V Scheme:
* -----------
* [TweetId]
* /metadata
* /delete_state (a.k.a. hard delete)
* /soft_delete_state
* /bounce_delete_state
* /undelete_state
* /force_added_state
* /scrubbed_fields/
* /[ScrubbedFieldId_1]
* ..
* /[ScrubbedFieldId_M]
* /fields
* /internal
* /1
* /9
* ..
* /99
* /external
* /100
* ..
*
* IMPORTANT NOTE:
* 1) Field Ids 2 to 8 in Tweet thrift struct are considered "core fields" are 'packed' together
* into a TFieldBlob and stored under field id 1 (i.e [DatasetName]/[TweetId]/fields/internal/1).
* This is why we do not see keys from [DatasetName]/[TweetId]/fields/internal/2 to [DatasetName]/
* [TweetId]/fields/internal/8)
*
* 2) Also, the tweet id (which is the field id 1 in Tweet thrift structure) is not explicitly stored
* in Manhattan. There is no need to explicitly store it since it is a part of the Pkey
*/
case class TweetKey(tweetId: TweetId, lKey: TweetKey.LKey) {
override def toString: String =
s"/${ManhattanOperations.PkeyInjection(tweetId)}/${ManhattanOperations.LkeyInjection(lKey)}"
}
object TweetKey {
// Manhattan uses lexicographical order for keys. To make sure lexicographical order matches the
// numerical order, we should pad both tweet id and field ids with leading zeros.
// Since tweet id is long and field id is a short, the max width of each can be obtained by doing
// Long.MaxValue.toString.length and Short.MaxValue.toString.length respectively
private val TweetIdFormatStr = s"%0${Long.MaxValue.toString.length}d"
private val FieldIdFormatStr = s"%0${Short.MaxValue.toString.length}d"
private[storage] def padTweetIdStr(tweetId: Long): String = TweetIdFormatStr.format(tweetId)
private[storage] def padFieldIdStr(fieldId: Short): String = FieldIdFormatStr.format(fieldId)
def coreFieldsKey(tweetId: TweetId): TweetKey = TweetKey(tweetId, LKey.CoreFieldsKey)
def hardDeletionStateKey(tweetId: TweetId): TweetKey =
TweetKey(tweetId, LKey.HardDeletionStateKey)
def softDeletionStateKey(tweetId: TweetId): TweetKey =
TweetKey(tweetId, LKey.SoftDeletionStateKey)
def bounceDeletionStateKey(tweetId: TweetId): TweetKey =
TweetKey(tweetId, LKey.BounceDeletionStateKey)
def unDeletionStateKey(tweetId: TweetId): TweetKey = TweetKey(tweetId, LKey.UnDeletionStateKey)
def forceAddedStateKey(tweetId: TweetId): TweetKey = TweetKey(tweetId, LKey.ForceAddedStateKey)
def scrubbedGeoFieldKey(tweetId: TweetId): TweetKey = TweetKey(tweetId, LKey.ScrubbedGeoFieldKey)
def fieldKey(tweetId: TweetId, fieldId: FieldId): TweetKey =
TweetKey(tweetId, LKey.FieldKey(fieldId))
def internalFieldsKey(tweetId: TweetId, fieldId: FieldId): TweetKey =
TweetKey(tweetId, LKey.InternalFieldsKey(fieldId))
def additionalFieldsKey(tweetId: TweetId, fieldId: FieldId): TweetKey =
TweetKey(tweetId, LKey.AdditionalFieldsKey(fieldId))
def scrubbedFieldKey(tweetId: TweetId, fieldId: FieldId): TweetKey =
TweetKey(tweetId, LKey.ScrubbedFieldKey(fieldId))
// AllFieldsKeyPrefix: fields
// CoreFieldsKey: fields/internal/1 (Stores subset of StoredTweet fields which are
// "packed" into a single CoreFields record)
// HardDeletionStateKey: metadata/delete_state
// SoftDeletionStateKey: metadata/soft_delete_state
// BounceDeletionStateKey: metadata/bounce_delete_state
// UnDeletionStateKey: metadata/undelete_state
// ForceAddedStateKey: metadata/force_added_state
// FieldKey: fields/<group_name>/<padded_field_id> (where <group_name>
// is 'internal' for field ids < 100 and 'external' for all other
// fields ids)
// InternalFieldsKeyPrefix: fields/internal
// PKey: <empty string>
// ScrubbedFieldKey: metadata/scrubbed_fields/<padded_field_id>
// ScrubbedFieldKeyPrefix: metadata/scrubbed_fields
sealed abstract class LKey(override val toString: String)
object LKey {
private val HardDeletionRecordLiteral = "delete_state"
private val SoftDeletionRecordLiteral = "soft_delete_state"
private val BounceDeletionRecordLiteral = "bounce_delete_state"
private val UnDeletionRecordLiteral = "undelete_state"
private val ForceAddRecordLiteral = "force_added_state"
private val ScrubbedFieldsGroup = "scrubbed_fields"
private val InternalFieldsGroup = "internal"
private val ExternalFieldsGroup = "external"
private val MetadataCategory = "metadata"
private val FieldsCategory = "fields"
private val InternalFieldsKeyPrefix = s"$FieldsCategory/$InternalFieldsGroup/"
private val ExternalFieldsKeyPrefix = s"$FieldsCategory/$ExternalFieldsGroup/"
private val ScrubbedFieldsKeyPrefix = s"$MetadataCategory/$ScrubbedFieldsGroup/"
sealed abstract class MetadataKey(metadataType: String)
extends LKey(s"$MetadataCategory/$metadataType")
sealed abstract class StateKey(stateType: String) extends MetadataKey(stateType)
case object HardDeletionStateKey extends StateKey(s"$HardDeletionRecordLiteral")
case object SoftDeletionStateKey extends StateKey(s"$SoftDeletionRecordLiteral")
case object BounceDeletionStateKey extends StateKey(s"$BounceDeletionRecordLiteral")
case object UnDeletionStateKey extends StateKey(s"$UnDeletionRecordLiteral")
case object ForceAddedStateKey extends StateKey(s"$ForceAddRecordLiteral")
case class ScrubbedFieldKey(fieldId: FieldId)
extends MetadataKey(s"$ScrubbedFieldsGroup/${padFieldIdStr(fieldId)}")
val ScrubbedGeoFieldKey: LKey.ScrubbedFieldKey = ScrubbedFieldKey(TweetFields.geoFieldId)
/**
* LKey that has one of many possible fields id. This generalize over
* internal and additional fields key.
*/
sealed abstract class FieldKey(prefix: String) extends LKey(toString) {
def fieldId: FieldId
override val toString: String = prefix + padFieldIdStr(fieldId)
}
object FieldKey {
def apply(fieldId: FieldId): FieldKey =
fieldId match {
case id if id < TweetFields.firstAdditionalFieldId => InternalFieldsKey(fieldId)
case _ => AdditionalFieldsKey(fieldId)
}
}
case class InternalFieldsKey(fieldId: FieldId) extends FieldKey(InternalFieldsKeyPrefix) {
assert(fieldId < TweetFields.firstAdditionalFieldId)
}
case class AdditionalFieldsKey(fieldId: FieldId) extends FieldKey(ExternalFieldsKeyPrefix) {
assert(fieldId >= TweetFields.firstAdditionalFieldId)
}
val CoreFieldsKey: LKey.InternalFieldsKey = InternalFieldsKey(TweetFields.rootCoreFieldId)
case class Unknown private (str: String) extends LKey(str)
def fromString(str: String): LKey = {
def extractFieldId(prefix: String): FieldId =
str.slice(prefix.length, str.length).toShort
str match {
case CoreFieldsKey.toString => CoreFieldsKey
case HardDeletionStateKey.toString => HardDeletionStateKey
case SoftDeletionStateKey.toString => SoftDeletionStateKey
case BounceDeletionStateKey.toString => BounceDeletionStateKey
case UnDeletionStateKey.toString => UnDeletionStateKey
case ForceAddedStateKey.toString => ForceAddedStateKey
case ScrubbedGeoFieldKey.toString => ScrubbedGeoFieldKey
case _ if str.startsWith(InternalFieldsKeyPrefix) =>
InternalFieldsKey(extractFieldId(InternalFieldsKeyPrefix))
case _ if str.startsWith(ExternalFieldsKeyPrefix) =>
AdditionalFieldsKey(extractFieldId(ExternalFieldsKeyPrefix))
case _ if str.startsWith(ScrubbedFieldsKeyPrefix) =>
ScrubbedFieldKey(extractFieldId(ScrubbedFieldsKeyPrefix))
case _ => Unknown(str)
}
}
}
}

View file

@ -0,0 +1,90 @@
package com.twitter.tweetypie.storage
import com.twitter.storage.client.manhattan.kv.ManhattanValue
import com.twitter.util.Time
/**
* A [[TweetStateRecord]] represents an action taken on a tweet and can be used to determine a tweet's state.
*
* The state is determined by the record with the most recent timestamp. In the absence of any
* record a tweet is considered found, which is to say the tweet has not been through the
* deletion process.
*
* The [[TweetStateRecord]] type is determined by the lkey of a tweet manhattan record:
* metadata/delete_state -> HardDeleted
* metadata/soft_delete_state -> SoftDeleted
* metadata/undelete_state -> Undeleted
* metadata/force_added_state -> ForceAdded
*
* See the README in this directory for more details about the state of a tweet.
*/
sealed trait TweetStateRecord {
def tweetId: TweetId
def createdAt: Long
def stateKey: TweetKey.LKey.StateKey
def values: Map[String, Long] = Map("timestamp" -> createdAt)
def name: String
def toTweetMhRecord: TweetManhattanRecord = {
val valByteBuffer = ByteArrayCodec.toByteBuffer(Json.encode(values))
val value = ManhattanValue(valByteBuffer, Some(Time.fromMilliseconds(createdAt)))
TweetManhattanRecord(TweetKey(tweetId, stateKey), value)
}
}
object TweetStateRecord {
/** When a soft-deleted or bounce deleted tweet is ultimately hard-deleted by an offline job. */
case class HardDeleted(tweetId: TweetId, createdAt: Long, deletedAt: Long)
extends TweetStateRecord {
// timestamp in the mh backend is the hard deletion timestamp
override def values = Map("timestamp" -> createdAt, "softdelete_timestamp" -> deletedAt)
def stateKey = TweetKey.LKey.HardDeletionStateKey
def name = "hard_deleted"
}
/** When a tweet is deleted by the user. It can still be undeleted while in the soft deleted state. */
case class SoftDeleted(tweetId: TweetId, createdAt: Long) extends TweetStateRecord {
def stateKey = TweetKey.LKey.SoftDeletionStateKey
def name = "soft_deleted"
}
/** When a tweet is deleted by go/bouncer for violating Twitter Rules. It MAY NOT be undeleted. */
case class BounceDeleted(tweetId: TweetId, createdAt: Long) extends TweetStateRecord {
def stateKey = TweetKey.LKey.BounceDeletionStateKey
def name = "bounce_deleted"
}
/** When a tweet is undeleted by an internal system. */
case class Undeleted(tweetId: TweetId, createdAt: Long) extends TweetStateRecord {
def stateKey = TweetKey.LKey.UnDeletionStateKey
def name = "undeleted"
}
/** When a tweet is created using the forceAdd endpoint. */
case class ForceAdded(tweetId: TweetId, createdAt: Long) extends TweetStateRecord {
def stateKey = TweetKey.LKey.ForceAddedStateKey
def name = "force_added"
}
def fromTweetMhRecord(record: TweetManhattanRecord): Option[TweetStateRecord] = {
def ts = TimestampDecoder.decode(record, TimestampType.Default).getOrElse(0L)
def sdts = TimestampDecoder.decode(record, TimestampType.SoftDelete).getOrElse(0L)
def tweetId = record.pkey
record.lkey match {
case TweetKey.LKey.HardDeletionStateKey => Some(HardDeleted(tweetId, ts, sdts))
case TweetKey.LKey.SoftDeletionStateKey => Some(SoftDeleted(tweetId, ts))
case TweetKey.LKey.BounceDeletionStateKey => Some(BounceDeleted(tweetId, ts))
case TweetKey.LKey.UnDeletionStateKey => Some(Undeleted(tweetId, ts))
case TweetKey.LKey.ForceAddedStateKey => Some(ForceAdded(tweetId, ts))
case _ => None
}
}
def fromTweetMhRecords(records: Seq[TweetManhattanRecord]): Seq[TweetStateRecord] =
records.flatMap(fromTweetMhRecord)
def mostRecent(records: Seq[TweetManhattanRecord]): Option[TweetStateRecord] =
fromTweetMhRecords(records).sortBy(_.createdAt).lastOption
}

View file

@ -0,0 +1,201 @@
package com.twitter.tweetypie.storage
import com.twitter.stitch.Stitch
import com.twitter.tweetypie.storage.Response.TweetResponse
import com.twitter.tweetypie.thriftscala.Tweet
import com.twitter.util.Future
/**
* Interface for reading and writing tweet data in Manhattan
*/
trait TweetStorageClient {
import TweetStorageClient._
def addTweet: AddTweet
def deleteAdditionalFields: DeleteAdditionalFields
def getTweet: GetTweet
def getStoredTweet: GetStoredTweet
def getDeletedTweets: GetDeletedTweets
def undelete: Undelete
def updateTweet: UpdateTweet
def scrub: Scrub
def softDelete: SoftDelete
def bounceDelete: BounceDelete
def hardDeleteTweet: HardDeleteTweet
def ping: Ping
}
object TweetStorageClient {
type GetTweet = TweetId => Stitch[GetTweet.Response]
object GetTweet {
sealed trait Response
object Response {
case class Found(tweet: Tweet) extends Response
object NotFound extends Response
object Deleted extends Response
// On BounceDeleted, provide the full Tweet so that implementations
// (i.e. ManhattanTweetStorageClient) don't not need to be aware of the specific tweet
// fields required by callers for proper processing of bounced deleted tweets.
case class BounceDeleted(tweet: Tweet) extends Response
}
}
type GetStoredTweet = TweetId => Stitch[GetStoredTweet.Response]
object GetStoredTweet {
sealed abstract class Error(val message: String) {
override def toString: String = message
}
object Error {
case object TweetIsCorrupt extends Error("stored tweet data is corrupt and cannot be decoded")
case object ScrubbedFieldsPresent
extends Error("stored tweet fields that should be scrubbed are still present")
case object TweetFieldsMissingOrInvalid
extends Error("expected tweet fields are missing or contain invalid values")
case object TweetShouldBeHardDeleted
extends Error("stored tweet that should be hard deleted is still present")
}
sealed trait Response
object Response {
sealed trait StoredTweetMetadata {
def state: Option[TweetStateRecord]
def allStates: Seq[TweetStateRecord]
def scrubbedFields: Set[FieldId]
}
sealed trait StoredTweetErrors {
def errs: Seq[Error]
}
/**
* Tweet data was found, possibly state records and/or scrubbed field records.
*/
sealed trait FoundAny extends Response with StoredTweetMetadata {
def tweet: Tweet
}
object FoundAny {
def unapply(
response: Response
): Option[
(Tweet, Option[TweetStateRecord], Seq[TweetStateRecord], Set[FieldId], Seq[Error])
] =
response match {
case f: FoundWithErrors =>
Some((f.tweet, f.state, f.allStates, f.scrubbedFields, f.errs))
case f: FoundAny => Some((f.tweet, f.state, f.allStates, f.scrubbedFields, Seq.empty))
case _ => None
}
}
/**
* No records for this tweet id were found in storage
*/
case class NotFound(id: TweetId) extends Response
/**
* Data related to the Tweet id was found but could not be loaded successfully. The
* errs array contains details of the problems.
*/
case class Failed(
id: TweetId,
state: Option[TweetStateRecord],
allStates: Seq[TweetStateRecord],
scrubbedFields: Set[FieldId],
errs: Seq[Error],
) extends Response
with StoredTweetMetadata
with StoredTweetErrors
/**
* No Tweet data was found, and the most recent state record found is HardDeleted
*/
case class HardDeleted(
id: TweetId,
state: Option[TweetStateRecord.HardDeleted],
allStates: Seq[TweetStateRecord],
scrubbedFields: Set[FieldId],
) extends Response
with StoredTweetMetadata
/**
* Tweet data was found, and the most recent state record found, if any, is not
* any form of deletion record.
*/
case class Found(
tweet: Tweet,
state: Option[TweetStateRecord],
allStates: Seq[TweetStateRecord],
scrubbedFields: Set[FieldId],
) extends FoundAny
/**
* Tweet data was found, and the most recent state record found indicates deletion.
*/
case class FoundDeleted(
tweet: Tweet,
state: Option[TweetStateRecord],
allStates: Seq[TweetStateRecord],
scrubbedFields: Set[FieldId],
) extends FoundAny
/**
* Tweet data was found, however errors were detected in the stored data. Required
* fields may be missing from the Tweet struct (e.g. CoreData), stored fields that
* should be scrubbed remain present, or Tweets that should be hard-deleted remain
* in storage. The errs array contains details of the problems.
*/
case class FoundWithErrors(
tweet: Tweet,
state: Option[TweetStateRecord],
allStates: Seq[TweetStateRecord],
scrubbedFields: Set[FieldId],
errs: Seq[Error],
) extends FoundAny
with StoredTweetErrors
}
}
type HardDeleteTweet = TweetId => Stitch[HardDeleteTweet.Response]
type SoftDelete = TweetId => Stitch[Unit]
type BounceDelete = TweetId => Stitch[Unit]
object HardDeleteTweet {
sealed trait Response
object Response {
case class Deleted(deletedAtMillis: Option[Long], createdAtMillis: Option[Long])
extends Response
case class NotDeleted(id: TweetId, ineligibleLKey: Option[TweetKey.LKey])
extends Throwable
with Response
}
}
type Undelete = TweetId => Stitch[Undelete.Response]
object Undelete {
case class Response(
code: UndeleteResponseCode,
tweet: Option[Tweet] = None,
createdAtMillis: Option[Long] = None,
archivedAtMillis: Option[Long] = None)
sealed trait UndeleteResponseCode
object UndeleteResponseCode {
object Success extends UndeleteResponseCode
object BackupNotFound extends UndeleteResponseCode
object NotCreated extends UndeleteResponseCode
}
}
type AddTweet = Tweet => Stitch[Unit]
type UpdateTweet = (Tweet, Seq[Field]) => Stitch[TweetResponse]
type GetDeletedTweets = Seq[TweetId] => Stitch[Seq[DeletedTweetResponse]]
type DeleteAdditionalFields = (Seq[TweetId], Seq[Field]) => Stitch[Seq[TweetResponse]]
type Scrub = (Seq[TweetId], Seq[Field]) => Stitch[Unit]
type Ping = () => Future[Unit]
}

View file

@ -0,0 +1,34 @@
package com.twitter.tweetypie.storage
import scala.util.control.NoStackTrace
sealed abstract class TweetStorageException(message: String, cause: Throwable)
extends Exception(message, cause)
/**
* The request was not properly formed and failed an assertion present in the code. Should not be
* retried without modification.
*/
case class ClientError(message: String, cause: Throwable)
extends TweetStorageException(message, cause)
with NoStackTrace
/**
* Request was rejected by Manhattan or the in-process rate limiter. Should not be retried.
*/
case class RateLimited(message: String, cause: Throwable)
extends TweetStorageException(message, cause)
with NoStackTrace
/**
* Corrupt tweets were requested from Manhattan
*/
case class VersionMismatchError(message: String, cause: Throwable = null)
extends TweetStorageException(message, cause)
with NoStackTrace
/**
* All other unhandled exceptions.
*/
case class InternalError(message: String, cause: Throwable = null)
extends TweetStorageException(message, cause)

View file

@ -0,0 +1,265 @@
package com.twitter.tweetypie.storage
import com.twitter.logging.Logger
import com.twitter.scrooge.TFieldBlob
import com.twitter.snowflake.id.SnowflakeId
import com.twitter.storage.client.manhattan.kv.DeniedManhattanException
import com.twitter.storage.client.manhattan.kv.ManhattanException
import com.twitter.tweetypie.storage.Response._
import com.twitter.tweetypie.storage_internal.thriftscala.StoredTweet
import com.twitter.util.Return
import com.twitter.util.Throw
import com.twitter.util.Try
object TweetUtils {
val log: Logger = Logger("com.twitter.tweetypie.storage.TweetStorageLibrary")
import FieldResponseCodec.ValueNotFoundException
/**
* It's rare, but we have seen tweets with userId=0, which is likely the result of a
* failed/partial delete. Treat these as invalid tweets, which are returned to callers
* as not found.
*/
def isValid(tweet: StoredTweet): Boolean =
tweet.userId.exists(_ != 0) && tweet.text.nonEmpty &&
tweet.createdVia.nonEmpty && tweet.createdAtSec.nonEmpty
/**
* Helper function to extract Scrubbed field Ids from the result returned by reading entire tweet prefix
* function.
*
* @param records The sequence of MH records for the given tweetId
*
* @return The set of scrubbed field ids
*/
private[tweetypie] def extractScrubbedFields(records: Seq[TweetManhattanRecord]): Set[Short] =
records
.map(r => r.lkey)
.collect { case TweetKey.LKey.ScrubbedFieldKey(fieldId) => fieldId }
.toSet
private[tweetypie] val expectedFields =
TweetFields.requiredFieldIds.toSet - TweetFields.tweetIdField
/**
* Find the timestamp from a tweetId and a list of MH records. This is used when
* you need a timestamp and you aren't sure that tweetId is a snowflake id.
*
* @param tweetId A tweetId you want the timestamp for.
* @param records Tbird_mh records keyed on tweetId, one of which should be the
* core fields record.
* @return A milliseconds timestamp if one could be found.
*/
private[tweetypie] def creationTimeFromTweetIdOrMHRecords(
tweetId: Long,
records: Seq[TweetManhattanRecord]
): Option[Long] =
SnowflakeId
.unixTimeMillisOptFromId(tweetId).orElse({
records
.find(_.lkey == TweetKey.LKey.CoreFieldsKey)
.flatMap { coreFields =>
CoreFieldsCodec
.fromTFieldBlob(
TFieldBlobCodec.fromByteBuffer(coreFields.value.contents)
).createdAtSec.map(seconds => seconds * 1000)
}
})
/**
* Helper function used to parse manhattan results for fields in a tweet (given in the form of
* Sequence of (FieldKey, Try[Unit]) pairs) and build a TweetResponse object.
*
* @param callerName The name of the caller function. Used for error messages
* @param tweetId Id of the Tweet for which TweetResponse is being built
* @param fieldResults Sequence of (FieldKey, Try[Unit]).
*
* @return TweetResponse object
*/
private[tweetypie] def buildTweetResponse(
callerName: String,
tweetId: Long,
fieldResults: Map[FieldId, Try[Unit]]
): TweetResponse = {
// Count Found/Not Found
val successCount =
fieldResults.foldLeft(0) {
case (count, (_, Return(_))) => count + 1
case (count, (_, Throw(_: ValueNotFoundException))) => count + 1
case (count, _) => count
}
val fieldResponsesMap = getFieldResponses(callerName, tweetId, fieldResults)
val overallCode = if (successCount > 0 && successCount == fieldResults.size) {
TweetResponseCode.Success
} else {
// If any field was rate limited, then we consider the entire tweet to be rate limited. So first we scan
// the field results to check such an occurrence.
val wasRateLimited = fieldResults.exists { fieldResult =>
fieldResult._2 match {
case Throw(e: DeniedManhattanException) => true
case _ => false
}
}
// Were we rate limited for any of the additional fields?
if (wasRateLimited) {
TweetResponseCode.OverCapacity
} else if (successCount == 0) {
// successCount is < fieldResults.size at this point. So if allOrNone is true or
// if successCount == 0 (i.e failed on all Fields), the overall code should be 'Failure'
TweetResponseCode.Failure
} else {
// allOrNone == false AND successCount > 0 at this point. Clearly the overallCode should be Partial
TweetResponseCode.Partial
}
}
TweetResponse(tweetId, overallCode, Some(fieldResponsesMap))
}
/**
* Helper function to convert manhattan results into a Map[FieldId, FieldResponse]
*
* @param fieldResults Sequence of (TweetKey, TFieldBlob).
*/
private[tweetypie] def getFieldResponses(
callerName: String,
tweetId: TweetId,
fieldResults: Map[FieldId, Try[_]]
): Map[FieldId, FieldResponse] =
fieldResults.map {
case (fieldId, resp) =>
def keyStr = TweetKey.fieldKey(tweetId, fieldId).toString
resp match {
case Return(_) =>
fieldId -> FieldResponse(FieldResponseCode.Success, None)
case Throw(mhException: ManhattanException) =>
val errMsg = s"Exception in $callerName. Key: $keyStr. Error: $mhException"
mhException match {
case _: ValueNotFoundException => // ValueNotFound is not an error
case _ => log.error(errMsg)
}
fieldId -> FieldResponseCodec.fromThrowable(mhException, Some(errMsg))
case Throw(e) =>
val errMsg = s"Exception in $callerName. Key: $keyStr. Error: $e"
log.error(errMsg)
fieldId -> FieldResponse(FieldResponseCode.Error, Some(errMsg))
}
}
/**
* Helper function to build a TweetResponse object when being rate limited. Its possible that only some of the fields
* got rate limited, so we indicate which fields got processed successfully, and which encountered some sort of error.
*
* @param tweetId Tweet id
* @param callerName name of API calling this function
* @param fieldResponses field responses for the case where
*
* @return The TweetResponse object
*/
private[tweetypie] def buildTweetOverCapacityResponse(
callerName: String,
tweetId: Long,
fieldResponses: Map[FieldId, Try[Unit]]
) = {
val fieldResponsesMap = getFieldResponses(callerName, tweetId, fieldResponses)
TweetResponse(tweetId, TweetResponseCode.OverCapacity, Some(fieldResponsesMap))
}
/**
* Build a StoredTweet from a Seq of records. Core fields are handled specially.
*/
private[tweetypie] def buildStoredTweet(
tweetId: TweetId,
records: Seq[TweetManhattanRecord],
includeScrubbed: Boolean = false,
): StoredTweet = {
getStoredTweetBlobs(records, includeScrubbed)
.flatMap { fieldBlob =>
// When fieldId == TweetFields.rootCoreFieldId, we have further work to do since the
// 'value' is really serialized/packed version of all core fields. In this case we'll have
// to unpack it into many TFieldBlobs.
if (fieldBlob.id == TweetFields.rootCoreFieldId) {
// We won't throw any error in this function and instead let the caller function handle this
// condition (i.e If the caller function does not find any values for the core-fields in
// the returned map, it should assume that the tweet is not found)
CoreFieldsCodec.unpackFields(fieldBlob).values.toSeq
} else {
Seq(fieldBlob)
}
}.foldLeft(StoredTweet(tweetId))(_.setField(_))
}
private[tweetypie] def buildValidStoredTweet(
tweetId: TweetId,
records: Seq[TweetManhattanRecord]
): Option[StoredTweet] = {
val storedTweet = buildStoredTweet(tweetId, records)
if (storedTweet.getFieldBlobs(expectedFields).nonEmpty && isValid(storedTweet)) {
Some(storedTweet)
} else {
None
}
}
/**
* Return a TFieldBlob for each StoredTweet field defined in this set of records.
* @param includeScrubbed when false, result will not include scrubbed fields even
* if the data is present in the set of records.
*/
private[tweetypie] def getStoredTweetBlobs(
records: Seq[TweetManhattanRecord],
includeScrubbed: Boolean = false,
): Seq[TFieldBlob] = {
val scrubbed = extractScrubbedFields(records)
records
.flatMap { r =>
// extract LKey.FieldKey records if they are not scrubbed and get their TFieldBlobs
r.key match {
case fullKey @ TweetKey(_, key: TweetKey.LKey.FieldKey)
if includeScrubbed || !scrubbed.contains(key.fieldId) =>
try {
val fieldBlob = TFieldBlobCodec.fromByteBuffer(r.value.contents)
if (fieldBlob.field.id != key.fieldId) {
throw new AssertionError(
s"Blob stored for $fullKey has unexpected id ${fieldBlob.field.id}"
)
}
Some(fieldBlob)
} catch {
case e: VersionMismatchError =>
log.error(
s"Failed to decode bytebuffer for $fullKey: ${e.getMessage}"
)
throw e
}
case _ => None
}
}
}
/**
* Its important to bubble up rate limiting exceptions as they would likely be the root cause for other issues
* (timeouts etc.), so we scan for this particular exception, and if found, we bubble that up specifically
*
* @param seqOfTries The sequence of tries which may contain within it a rate limit exception
*
* @return if a rate limiting exn was detected, this will be a Throw(e: DeniedManhattanException)
* otherwise it will be a Return(_) only if all individual tries succeeded
*/
private[tweetypie] def collectWithRateLimitCheck(seqOfTries: Seq[Try[Unit]]): Try[Unit] = {
val rateLimitThrowOpt = seqOfTries.find {
case Throw(e: DeniedManhattanException) => true
case _ => false
}
rateLimitThrowOpt.getOrElse(
Try.collect(seqOfTries).map(_ => ())
) // Operation is considered successful only if all the deletions are successful
}
}

View file

@ -0,0 +1,106 @@
package com.twitter.tweetypie.storage
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.stitch.Stitch
import com.twitter.tweetypie.storage.TweetStorageClient.Undelete
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.util.Time
object UndeleteHandler {
def apply(
read: ManhattanOperations.Read,
localInsert: ManhattanOperations.Insert,
remoteInsert: ManhattanOperations.Insert,
delete: ManhattanOperations.Delete,
undeleteWindowHours: Int,
stats: StatsReceiver
): Undelete = {
def withinUndeleteWindow(timestampMs: Long) =
(Time.now - Time.fromMilliseconds(timestampMs)).inHours < undeleteWindowHours
def prepareUndelete(
tweetId: TweetId,
records: Seq[TweetManhattanRecord]
): (Undelete.Response, Option[TweetManhattanRecord]) = {
val undeleteRecord =
Some(TweetStateRecord.Undeleted(tweetId, Time.now.inMillis).toTweetMhRecord)
TweetStateRecord.mostRecent(records) match {
// check if we need to undo a soft deletion
case Some(TweetStateRecord.SoftDeleted(_, createdAt)) =>
if (createdAt > 0) {
if (withinUndeleteWindow(createdAt)) {
(
mkSuccessfulUndeleteResponse(tweetId, records, Some(createdAt)),
undeleteRecord
)
} else {
(Undelete.Response(Undelete.UndeleteResponseCode.BackupNotFound), None)
}
} else {
throw InternalError(s"Timestamp unavailable for $tweetId")
}
// BounceDeleted tweets may not be undeleted. see go/bouncedtweet
case Some(_: TweetStateRecord.HardDeleted | _: TweetStateRecord.BounceDeleted) =>
(Undelete.Response(Undelete.UndeleteResponseCode.BackupNotFound), None)
case Some(_: TweetStateRecord.Undeleted) =>
// We still want to write the undelete record, because at this point, we only know that the local DC's
// winning record is not a soft/hard deletion record, while its possible that the remote DC's winning
// record might still be a soft deletion record. Having said that, we don't want to set it to true
// if the winning record is forceAdd, as the forceAdd call should have ensured that both DCs had the
// forceAdd record.
(mkSuccessfulUndeleteResponse(tweetId, records), undeleteRecord)
case Some(_: TweetStateRecord.ForceAdded) =>
(mkSuccessfulUndeleteResponse(tweetId, records), None)
// lets write the undeletion record just in case there is a softdeletion record in flight
case None => (mkSuccessfulUndeleteResponse(tweetId, records), undeleteRecord)
}
}
// Write the undelete record both locally and remotely to protect
// against races with hard delete replication. We only need this
// protection for the insertion of the undelete record.
def multiInsert(record: TweetManhattanRecord): Stitch[Unit] =
Stitch
.collect(
Seq(
localInsert(record).liftToTry,
remoteInsert(record).liftToTry
)
)
.map(collectWithRateLimitCheck)
.lowerFromTry
def deleteSoftDeleteRecord(tweetId: TweetId): Stitch[Unit] = {
val mhKey = TweetKey.softDeletionStateKey(tweetId)
delete(mhKey, None)
}
tweetId =>
for {
records <- read(tweetId)
(response, undeleteRecord) = prepareUndelete(tweetId, records)
_ <- Stitch.collect(undeleteRecord.map(multiInsert)).unit
_ <- deleteSoftDeleteRecord(tweetId)
} yield {
response
}
}
private[storage] def mkSuccessfulUndeleteResponse(
tweetId: TweetId,
records: Seq[TweetManhattanRecord],
timestampOpt: Option[Long] = None
) =
Undelete.Response(
Undelete.UndeleteResponseCode.Success,
Some(
StorageConversions.fromStoredTweet(buildStoredTweet(tweetId, records))
),
archivedAtMillis = timestampOpt
)
}

View file

@ -0,0 +1,64 @@
package com.twitter.tweetypie.storage
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.stitch.Stitch
import com.twitter.storage.client.manhattan.kv.DeniedManhattanException
import com.twitter.storage.client.manhattan.kv.ManhattanValue
import com.twitter.tweetypie.storage.TweetUtils._
import com.twitter.tweetypie.thriftscala.Tweet
import com.twitter.util.Throw
import com.twitter.util.Time
object UpdateTweetHandler {
def apply(
insert: ManhattanOperations.Insert,
stats: StatsReceiver
): TweetStorageClient.UpdateTweet = { (tpTweet: Tweet, fields: Seq[Field]) =>
require(
fields.forall(!TweetFields.coreFieldIds.contains(_)),
"Core fields cannot be modified by calling updateTweet; use addTweet instead."
)
require(
areAllFieldsDefined(tpTweet, fields),
s"Input tweet $tpTweet does not have specified fields $fields set"
)
val now = Time.now
val storedTweet = StorageConversions.toStoredTweetForFields(tpTweet, fields.toSet)
val tweetId = storedTweet.id
Stats.updatePerFieldQpsCounters("updateTweet", fields.map(_.id), 1, stats)
val (fieldIds, stitchesPerTweet) =
fields.map { field =>
val fieldId = field.id
val tweetKey = TweetKey.fieldKey(tweetId, fieldId)
val blob = storedTweet.getFieldBlob(fieldId).get
val value = ManhattanValue(TFieldBlobCodec.toByteBuffer(blob), Some(now))
val record = TweetManhattanRecord(tweetKey, value)
(fieldId, insert(record).liftToTry)
}.unzip
Stitch.collect(stitchesPerTweet).map { seqOfTries =>
val fieldkeyAndMhResults = fieldIds.zip(seqOfTries).toMap
// If even a single field was rate limited, we will send an overall OverCapacity TweetResponse
val wasRateLimited = fieldkeyAndMhResults.exists { keyAndResult =>
keyAndResult._2 match {
case Throw(e: DeniedManhattanException) => true
case _ => false
}
}
if (wasRateLimited) {
buildTweetOverCapacityResponse("updateTweets", tweetId, fieldkeyAndMhResults)
} else {
buildTweetResponse("updateTweets", tweetId, fieldkeyAndMhResults)
}
}
}
private def areAllFieldsDefined(tpTweet: Tweet, fields: Seq[Field]) = {
val storedTweet = StorageConversions.toStoredTweetForFields(tpTweet, fields.toSet)
fields.map(_.id).forall(storedTweet.getFieldBlob(_).isDefined)
}
}

View file

@ -0,0 +1,11 @@
package com.twitter.tweetypie
import com.twitter.storage.client.manhattan.kv.ManhattanValue
import java.nio.ByteBuffer
package object storage {
type TweetId = Long
type FieldId = Short
type TweetManhattanValue = ManhattanValue[ByteBuffer]
}

View file

@ -0,0 +1,20 @@
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"finagle/finagle-core/src/main",
"flock-client/src/main/scala",
"flock-client/src/main/thrift:thrift-scala",
"tweetypie/servo/util/src/main/scala",
"snowflake:id",
"src/thrift/com/twitter/gizmoduck:thrift-scala",
"src/thrift/com/twitter/servo:servo-exception-java",
"tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
"tweetypie/server/src/main/scala/com/twitter/tweetypie",
"tweetypie/server/src/main/scala/com/twitter/tweetypie/serverutil",
"tweetypie/common/src/scala/com/twitter/tweetypie/util",
"util/util-core:scala",
],
)

View file

@ -0,0 +1,532 @@
/** Copyright 2010 Twitter, Inc. */
package com.twitter.tweetypie
package tflock
import com.twitter.finagle.stats.Counter
import com.twitter.flockdb.client._
import com.twitter.flockdb.client.thriftscala.Priority
import com.twitter.snowflake.id.SnowflakeId
import com.twitter.tweetypie.serverutil.StoredCard
import com.twitter.tweetypie.thriftscala._
import com.twitter.util.Future
import scala.collection.mutable.ListBuffer
object TFlockIndexer {
/**
* Printable names for some edge types currently defined in [[com.twitter.flockdb.client]].
* Used to defined stats counters for adding edges.
*/
val graphNames: Map[Int, String] =
Map(
CardTweetsGraph.id -> "card_tweets",
ConversationGraph.id -> "conversation",
DirectedAtUserIdGraph.id -> "directed_at_user_id",
InvitedUsersGraph.id -> "invited_users",
MediaTimelineGraph.id -> "media_timeline",
MentionsGraph.id -> "mentions",
NarrowcastSentTweetsGraph.id -> "narrowcast_sent_tweets",
NullcastedTweetsGraph.id -> "nullcasted_tweets",
QuotersGraph.id -> "quoters",
QuotesGraph.id -> "quotes",
QuoteTweetsIndexGraph.id -> "quote_tweets_index",
RepliesToTweetsGraph.id -> "replies_to_tweets",
RetweetsByMeGraph.id -> "retweets_by_me",
RetweetsGraph.id -> "retweets",
RetweetsOfMeGraph.id -> "retweets_of_me",
RetweetSourceGraph.id -> "retweet_source",
TweetsRetweetedGraph.id -> "tweets_retweeted",
UserTimelineGraph.id -> "user_timeline",
CreatorSubscriptionTimelineGraph.id -> "creator_subscription_timeline",
CreatorSubscriptionMediaTimelineGraph.id -> "creator_subscription_image_timeline",
)
/**
* On edge deletion, edges are either archived permanently or retained for 3 months, based on
* the retention policy in the above confluence page.
*
* These two retention policies correspond to the two deletion techniques: archive and remove.
* We call removeEdges for edges with a short retention policy and archiveEdges for edges with
* a permanent retention policy.
*/
val graphsWithRemovedEdges: Seq[Int] =
Seq(
CardTweetsGraph.id,
CuratedTimelineGraph.id,
CuratedTweetsGraph.id,
DirectedAtUserIdGraph.id,
MediaTimelineGraph.id,
MutedConversationsGraph.id,
QuotersGraph.id,
QuotesGraph.id,
QuoteTweetsIndexGraph.id,
ReportedTweetsGraph.id,
RetweetsOfMeGraph.id,
RetweetSourceGraph.id,
SoftLikesGraph.id,
TweetsRetweetedGraph.id,
CreatorSubscriptionTimelineGraph.id,
CreatorSubscriptionMediaTimelineGraph.id,
)
/**
* These edges should be left in place when bounced tweets are deleted.
* These edges are removed during hard deletion.
*
* This is done so external teams (timelines) can execute on these edges for
* tombstone feature.
*/
val bounceDeleteGraphIds: Set[Int] =
Set(
UserTimelineGraph.id,
ConversationGraph.id
)
def makeCounters(stats: StatsReceiver, operation: String): Map[Int, Counter] = {
TFlockIndexer.graphNames
.mapValues(stats.scope(_).counter(operation))
.withDefaultValue(stats.scope("unknown").counter(operation))
}
}
/**
* @param backgroundIndexingPriority specifies the queue to use for
* background indexing operations. This is useful for making the
* effects of background indexing operations (such as deleting edges
* for deleted Tweets) available sooner in testing scenarios
* (end-to-end tests or development instances). It is set to
* Priority.Low in production to reduce the load on high priority
* queues that we use for prominently user-visible operations.
*/
class TFlockIndexer(
tflock: TFlockClient,
hasMedia: Tweet => Boolean,
backgroundIndexingPriority: Priority,
stats: StatsReceiver)
extends TweetIndexer {
private[this] val FutureNil = Future.Nil
private[this] val archiveCounters = TFlockIndexer.makeCounters(stats, "archive")
private[this] val removeCounters = TFlockIndexer.makeCounters(stats, "remove")
private[this] val insertCounters = TFlockIndexer.makeCounters(stats, "insert")
private[this] val negateCounters = TFlockIndexer.makeCounters(stats, "negate")
private[this] val foregroundIndexingPriority: Priority = Priority.High
override def createIndex(tweet: Tweet): Future[Unit] =
createEdges(tweet, isUndelete = false)
override def undeleteIndex(tweet: Tweet): Future[Unit] =
createEdges(tweet, isUndelete = true)
private[this] case class PartitionedEdges(
longRetention: Seq[ExecuteEdge[StatusGraph]] = Nil,
shortRetention: Seq[ExecuteEdge[StatusGraph]] = Nil,
negate: Seq[ExecuteEdge[StatusGraph]] = Nil,
ignore: Seq[ExecuteEdge[StatusGraph]] = Nil)
private[this] def partitionEdgesForDelete(
edges: Seq[ExecuteEdge[StatusGraph]],
isBounceDelete: Boolean
) =
edges.foldLeft(PartitionedEdges()) {
// Two dependees of UserTimelineGraph edge states to satisfy: timelines & safety tools.
// Timelines show bounce-deleted tweets as tombstones; regular deletes are not shown.
// - i.e. timelineIds = UserTimelineGraph(Normal || Negative)
// Safety tools show deleted tweets to authorized internal review agents
// - i.e. deletedIds = UserTimelineGraph(Removed || Negative)
case (partitionedEdges, edge) if isBounceDelete && edge.graphId == UserTimelineGraph.id =>
partitionedEdges.copy(negate = edge +: partitionedEdges.negate)
case (partitionedEdges, edge) if isBounceDelete && edge.graphId == ConversationGraph.id =>
// Bounce-deleted tweets remain rendered as tombstones in conversations, so do not modify
// the ConversationGraph edge state
partitionedEdges.copy(ignore = edge +: partitionedEdges.ignore)
case (partitionedEdges, edge)
if TFlockIndexer.graphsWithRemovedEdges.contains(edge.graphId) =>
partitionedEdges.copy(shortRetention = edge +: partitionedEdges.shortRetention)
case (partitionedEdges, edge) =>
partitionedEdges.copy(longRetention = edge +: partitionedEdges.longRetention)
}
override def deleteIndex(tweet: Tweet, isBounceDelete: Boolean): Future[Unit] =
for {
edges <- getEdges(tweet, isCreate = false, isDelete = true, isUndelete = false)
partitionedEdges = partitionEdgesForDelete(edges, isBounceDelete)
() <-
Future
.join(
tflock
.archiveEdges(partitionedEdges.longRetention, backgroundIndexingPriority)
.onSuccess(_ =>
partitionedEdges.longRetention.foreach(e => archiveCounters(e.graphId).incr())),
tflock
.removeEdges(partitionedEdges.shortRetention, backgroundIndexingPriority)
.onSuccess(_ =>
partitionedEdges.shortRetention.foreach(e => removeCounters(e.graphId).incr())),
tflock
.negateEdges(partitionedEdges.negate, backgroundIndexingPriority)
.onSuccess(_ =>
partitionedEdges.negate.foreach(e => negateCounters(e.graphId).incr()))
)
.unit
} yield ()
/**
* This operation is called when a user is put into or taken out of
* a state in which their retweets should no longer be visible
* (e.g. suspended or ROPO).
*/
override def setRetweetVisibility(retweetId: TweetId, setVisible: Boolean): Future[Unit] = {
val retweetEdge = Seq(ExecuteEdge(retweetId, RetweetsGraph, None, Reverse))
if (setVisible) {
tflock
.insertEdges(retweetEdge, backgroundIndexingPriority)
.onSuccess(_ => insertCounters(RetweetsGraph.id).incr())
} else {
tflock
.archiveEdges(retweetEdge, backgroundIndexingPriority)
.onSuccess(_ => archiveCounters(RetweetsGraph.id).incr())
}
}
private[this] def createEdges(tweet: Tweet, isUndelete: Boolean): Future[Unit] =
for {
edges <- getEdges(tweet = tweet, isCreate = true, isDelete = false, isUndelete = isUndelete)
() <- tflock.insertEdges(edges, foregroundIndexingPriority)
} yield {
// Count all the edges we've successfully added:
edges.foreach(e => insertCounters(e.graphId).incr())
}
private[this] def addRTEdges(
tweet: Tweet,
share: Share,
isCreate: Boolean,
edges: ListBuffer[ExecuteEdge[StatusGraph]],
futureEdges: ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]]
): Unit = {
edges += RetweetsOfMeGraph.edge(share.sourceUserId, tweet.id)
edges += RetweetsByMeGraph.edge(getUserId(tweet), tweet.id)
edges += RetweetsGraph.edge(share.sourceStatusId, tweet.id)
if (isCreate) {
edges += ExecuteEdge(
sourceId = getUserId(tweet),
graph = RetweetSourceGraph,
destinationIds = Some(Seq(share.sourceStatusId)),
direction = Forward,
position = Some(SnowflakeId(tweet.id).time.inMillis)
)
edges.append(TweetsRetweetedGraph.edge(share.sourceUserId, share.sourceStatusId))
} else {
edges += RetweetSourceGraph.edge(getUserId(tweet), share.sourceStatusId)
// if this is the last retweet we need to remove it from the source user's
// tweets retweeted graph
futureEdges.append(
tflock.count(RetweetsGraph.from(share.sourceStatusId)).flatMap { count =>
if (count <= 1) {
tflock.selectAll(RetweetsGraph.from(share.sourceStatusId)).map { tweets =>
if (tweets.size <= 1)
Seq(TweetsRetweetedGraph.edge(share.sourceUserId, share.sourceStatusId))
else
Nil
}
} else {
FutureNil
}
}
)
}
}
private[this] def addReplyEdges(
tweet: Tweet,
edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit = {
getReply(tweet).foreach { reply =>
reply.inReplyToStatusId.flatMap { inReplyToStatusId =>
edges += RepliesToTweetsGraph.edge(inReplyToStatusId, tweet.id)
// only index conversationId if this is a reply to another tweet
TweetLenses.conversationId.get(tweet).map { conversationId =>
edges += ConversationGraph.edge(conversationId, tweet.id)
}
}
}
}
private[this] def addDirectedAtEdges(
tweet: Tweet,
edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit = {
TweetLenses.directedAtUser.get(tweet).foreach { directedAtUser =>
edges += DirectedAtUserIdGraph.edge(directedAtUser.userId, tweet.id)
}
}
private[this] def addMentionEdges(
tweet: Tweet,
edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit = {
getMentions(tweet)
.flatMap(_.userId).foreach { mention =>
edges += MentionsGraph.edge(mention, tweet.id)
}
}
private[this] def addQTEdges(
tweet: Tweet,
edges: ListBuffer[ExecuteEdge[StatusGraph]],
futureEdges: ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]],
isCreate: Boolean
): Unit = {
val userId = getUserId(tweet)
tweet.quotedTweet.foreach { quotedTweet =>
// Regardless of tweet creates/deletes, we add the corresponding edges to the
// following two graphs. Note that we're handling the case for
// the QuotersGraph slightly differently in the tweet delete case.
edges.append(QuotesGraph.edge(quotedTweet.userId, tweet.id))
edges.append(QuoteTweetsIndexGraph.edge(quotedTweet.tweetId, tweet.id))
if (isCreate) {
// As mentioned above, for tweet creates we go ahead and add an edge
// to the QuotersGraph without any additional checks.
edges.append(QuotersGraph.edge(quotedTweet.tweetId, userId))
} else {
// For tweet deletes, we only add an edge to be deleted from the
// QuotersGraph if the tweeting user isn't quoting the tweet anymore
// i.e. if a user has quoted a tweet multiple times, we only delete
// an edge from the QuotersGraph if they've deleted all the quotes,
// otherwise an edge should exist by definition of what the QuotersGraph
// represents.
// Note: There can be a potential edge case here due to a race condition
// in the following scenario.
// i) A quotes a tweet T twice resulting in tweets T1 and T2.
// ii) There should exist edges in the QuotersGraph from T -> A and T1 <-> T, T2 <-> T in
// the QuoteTweetsIndexGraph, but one of the edges haven't been written
// to the QuoteTweetsIndex graph in TFlock yet.
// iii) In this scenario, we shouldn't really be deleting an edge as we're doing below.
// The approach that we're taking below is a "best effort" approach similar to what we
// currently do for RTs.
// Find all the quotes of the quoted tweet from the quoting user
val quotesFromQuotingUser = QuoteTweetsIndexGraph
.from(quotedTweet.tweetId)
.intersect(UserTimelineGraph.from(userId))
futureEdges.append(
tflock
.count(quotesFromQuotingUser).flatMap { count =>
// If this is the last quote of the quoted tweet from the quoting user,
// we go ahead and delete the edge from the QuotersGraph.
if (count <= 1) {
tflock.selectAll(quotesFromQuotingUser).map { tweets =>
if (tweets.size <= 1) {
Seq(QuotersGraph.edge(quotedTweet.tweetId, userId))
} else {
Nil
}
}
} else {
FutureNil
}
}
)
}
}
}
private[this] def addCardEdges(
tweet: Tweet,
edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit = {
// Note that we are indexing only the TOO "stored" cards
// (cardUri=card://<cardId>). Rest of the cards are ignored here.
tweet.cardReference
.collect {
case StoredCard(id) =>
edges.append(CardTweetsGraph.edge(id, tweet.id))
}.getOrElse(())
}
// Note: on undelete, this method restores all archived edges, including those that may have
// been archived prior to the delete. This is incorrect behavior but in practice rarely
// causes problems, as undeletes are so rare.
private[this] def addEdgesForDeleteOrUndelete(
tweet: Tweet,
edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit = {
edges.appendAll(
Seq(
MentionsGraph.edges(tweet.id, None, Reverse),
RepliesToTweetsGraph.edges(tweet.id, None)
)
)
// When we delete or undelete a conversation control root Tweet we want to archive or restore
// all the edges in InvitedUsersGraph from the Tweet id.
if (hasConversationControl(tweet) && isConversationRoot(tweet)) {
edges.append(InvitedUsersGraph.edges(tweet.id, None))
}
}
private[this] def addSimpleEdges(
tweet: Tweet,
edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit = {
if (TweetLenses.nullcast.get(tweet)) {
edges.append(NullcastedTweetsGraph.edge(getUserId(tweet), tweet.id))
} else if (TweetLenses.narrowcast.get(tweet).isDefined) {
edges.append(NarrowcastSentTweetsGraph.edge(getUserId(tweet), tweet.id))
} else {
edges.append(UserTimelineGraph.edge(getUserId(tweet), tweet.id))
if (hasMedia(tweet))
edges.append(MediaTimelineGraph.edge(getUserId(tweet), tweet.id))
// Index root creator subscription tweets.
// Ignore replies because those are not necessarily visible to a user who subscribes to tweet author
val isRootTweet: Boolean = tweet.coreData match {
case Some(c) => c.reply.isEmpty && c.share.isEmpty
case None => true
}
if (tweet.exclusiveTweetControl.isDefined && isRootTweet) {
edges.append(CreatorSubscriptionTimelineGraph.edge(getUserId(tweet), tweet.id))
if (hasMedia(tweet))
edges.append(CreatorSubscriptionMediaTimelineGraph.edge(getUserId(tweet), tweet.id))
}
}
}
/**
* Issues edges for each mention of user in a conversation-controlled tweet. This way InvitedUsers
* graph accumulates complete set of ids for @mention-invited users, by conversation id.
*/
private def invitedUsersEdgesForCreate(
tweet: Tweet,
edges: ListBuffer[ExecuteEdge[StatusGraph]]
): Unit = {
val conversationId: Long = getConversationId(tweet).getOrElse(tweet.id)
val mentions: Seq[UserId] = getMentions(tweet).flatMap(_.userId)
edges.appendAll(mentions.map(userId => InvitedUsersGraph.edge(conversationId, userId)))
}
/**
* Issues edges of InviteUsersGraph that ought to be deleted for a conversation controlled reply.
* These are mentions of users in the given tweet, only if the user was not mentioned elsewhere
* in the conversation. This way for a conversation, InvitedUsersGraph would always hold a set
* of all users invited to the conversation, and an edge is removed only after the last mention of
* a user is deleted.
*/
private def invitedUsersEdgesForDelete(
tweet: Tweet,
futureEdges: ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]]
): Unit = {
getConversationId(tweet).foreach { conversationId: Long =>
val mentions: Seq[UserId] = getMentions(tweet).flatMap(_.userId)
mentions.foreach { userId =>
val tweetIdsWithinConversation = ConversationGraph.from(conversationId)
val tweetIdsThatMentionUser = MentionsGraph.from(userId)
futureEdges.append(
tflock
.selectAll(
query = tweetIdsThatMentionUser.intersect(tweetIdsWithinConversation),
limit = Some(2), // Just need to know if it is >1 or <=1, so 2 are enough.
pageSize = None // Provide default, otherwise Mockito complains
).map { tweetIds: Seq[Long] =>
if (tweetIds.size <= 1) {
Seq(InvitedUsersGraph.edge(conversationId, userId))
} else {
Nil
}
}
)
}
}
}
private def hasInviteViaMention(tweet: Tweet): Boolean = {
tweet.conversationControl match {
case Some(ConversationControl.ByInvitation(controls)) =>
controls.inviteViaMention.getOrElse(false)
case Some(ConversationControl.Community(controls)) =>
controls.inviteViaMention.getOrElse(false)
case Some(ConversationControl.Followers(followers)) =>
followers.inviteViaMention.getOrElse(false)
case _ =>
false
}
}
private def hasConversationControl(tweet: Tweet): Boolean =
tweet.conversationControl.isDefined
// If a Tweet has a ConversationControl, it must have a ConversationId associated with it so we
// can compare the ConversationId with the current Tweet ID to determine if it's the root of the
// conversation. See ConversationIdHydrator for more details
private def isConversationRoot(tweet: Tweet): Boolean =
getConversationId(tweet).get == tweet.id
private def addInvitedUsersEdges(
tweet: Tweet,
isCreate: Boolean,
isUndelete: Boolean,
edges: ListBuffer[ExecuteEdge[StatusGraph]],
futureEdges: ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]]
): Unit = {
if (hasConversationControl(tweet)) {
if (isCreate) {
if (isConversationRoot(tweet) && !isUndelete) {
// For root Tweets, only add edges for original creates, not for undeletes.
// Undeletes are handled by addEdgesForDeleteOrUndelete.
invitedUsersEdgesForCreate(tweet, edges)
}
if (!isConversationRoot(tweet) && hasInviteViaMention(tweet)) {
// For replies, only add edges when the conversation control is in inviteViaMention mode.
invitedUsersEdgesForCreate(tweet, edges)
}
} else {
if (!isConversationRoot(tweet)) {
invitedUsersEdgesForDelete(tweet, futureEdges)
}
}
}
}
private[this] def getEdges(
tweet: Tweet,
isCreate: Boolean,
isDelete: Boolean,
isUndelete: Boolean
): Future[Seq[ExecuteEdge[StatusGraph]]] = {
val edges = ListBuffer[ExecuteEdge[StatusGraph]]()
val futureEdges = ListBuffer[Future[Seq[ExecuteEdge[StatusGraph]]]]()
addSimpleEdges(tweet, edges)
getShare(tweet) match {
case Some(share) => addRTEdges(tweet, share, isCreate, edges, futureEdges)
case _ =>
addInvitedUsersEdges(tweet, isCreate, isUndelete, edges, futureEdges)
addReplyEdges(tweet, edges)
addDirectedAtEdges(tweet, edges)
addMentionEdges(tweet, edges)
addQTEdges(tweet, edges, futureEdges, isCreate)
addCardEdges(tweet, edges)
if (isDelete || isUndelete) {
addEdgesForDeleteOrUndelete(tweet, edges)
}
}
Future
.collect(futureEdges)
.map { moreEdges => (edges ++= moreEdges.flatten).toList }
}
}

View file

@ -0,0 +1,30 @@
/** Copyright 2010 Twitter, Inc. */
package com.twitter.tweetypie
package tflock
import com.twitter.tweetypie.thriftscala.Tweet
import com.twitter.util.Future
trait TweetIndexer {
/**
* Called at tweet-creation time, this method should set up all relevant indices on the tweet.
*/
def createIndex(tweet: Tweet): Future[Unit] = Future.Unit
/**
* Called at tweet-undelete time (which isn't yet handled), this method should
* restore all relevant indices on the tweet.
*/
def undeleteIndex(tweet: Tweet): Future[Unit] = Future.Unit
/**
* Called at tweet-delete time, this method should archive all relevant indices on the tweet.
*/
def deleteIndex(tweet: Tweet, isBounceDelete: Boolean): Future[Unit] = Future.Unit
/**
* This method should archive or unarchive the retweet edge in TFlock RetweetsGraph.
*/
def setRetweetVisibility(retweetId: TweetId, visible: Boolean): Future[Unit] = Future.Unit
}

View file

@ -0,0 +1,13 @@
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"finagle/finagle-core/src/main",
"scrooge/scrooge-core/src/main/scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:service-scala",
"util/util-core:scala",
],
)

View file

@ -0,0 +1,8 @@
package com.twitter.tweetypie.thriftscala
import com.twitter.finagle.service.FailedService
class NotImplementedTweetService
extends TweetService$FinagleClient(
new FailedService(new UnsupportedOperationException("not implemented"))
)

View file

@ -0,0 +1,79 @@
package com.twitter.tweetypie.thriftscala
import com.twitter.util.Future
/**
* A trait for TweetService implementations that wrap an underlying
* TweetService and need to modify only some of the methods.
*/
trait TweetServiceProxy extends TweetService.MethodPerEndpoint {
protected def underlying: TweetService.MethodPerEndpoint
/**
* Default implementation simply passes through the Future but logic can be added to wrap each
* invocation to the underlying TweetService
*/
protected def wrap[A](f: => Future[A]): Future[A] =
f
override def getTweets(request: GetTweetsRequest): Future[Seq[GetTweetResult]] =
wrap(underlying.getTweets(request))
override def getTweetFields(request: GetTweetFieldsRequest): Future[Seq[GetTweetFieldsResult]] =
wrap(underlying.getTweetFields(request))
override def getTweetCounts(request: GetTweetCountsRequest): Future[Seq[GetTweetCountsResult]] =
wrap(underlying.getTweetCounts(request))
override def setAdditionalFields(request: SetAdditionalFieldsRequest): Future[Unit] =
wrap(underlying.setAdditionalFields(request))
override def deleteAdditionalFields(request: DeleteAdditionalFieldsRequest): Future[Unit] =
wrap(underlying.deleteAdditionalFields(request))
override def postTweet(request: PostTweetRequest): Future[PostTweetResult] =
wrap(underlying.postTweet(request))
override def postRetweet(request: RetweetRequest): Future[PostTweetResult] =
wrap(underlying.postRetweet(request))
override def unretweet(request: UnretweetRequest): Future[UnretweetResult] =
wrap(underlying.unretweet(request))
override def getDeletedTweets(
request: GetDeletedTweetsRequest
): Future[Seq[GetDeletedTweetResult]] =
wrap(underlying.getDeletedTweets(request))
override def deleteTweets(request: DeleteTweetsRequest): Future[Seq[DeleteTweetResult]] =
wrap(underlying.deleteTweets(request))
override def updatePossiblySensitiveTweet(
request: UpdatePossiblySensitiveTweetRequest
): Future[Unit] =
wrap(underlying.updatePossiblySensitiveTweet(request))
override def undeleteTweet(request: UndeleteTweetRequest): Future[UndeleteTweetResponse] =
wrap(underlying.undeleteTweet(request))
override def eraseUserTweets(request: EraseUserTweetsRequest): Future[Unit] =
wrap(underlying.eraseUserTweets(request))
override def incrTweetFavCount(request: IncrTweetFavCountRequest): Future[Unit] =
wrap(underlying.incrTweetFavCount(request))
override def deleteLocationData(request: DeleteLocationDataRequest): Future[Unit] =
wrap(underlying.deleteLocationData(request))
override def scrubGeo(request: GeoScrub): Future[Unit] =
wrap(underlying.scrubGeo(request))
override def takedown(request: TakedownRequest): Future[Unit] =
wrap(underlying.takedown(request))
override def flush(request: FlushRequest): Future[Unit] =
wrap(underlying.flush(request))
override def incrTweetBookmarkCount(request: IncrTweetBookmarkCountRequest): Future[Unit] =
wrap(underlying.incrTweetBookmarkCount(request))
}

View file

@ -0,0 +1,15 @@
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"tweetypie/servo/util",
"tweetypie/common/src/thrift/com/twitter/tweetypie:media-entity-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
"tco-util",
"tweetypie/common/src/scala/com/twitter/tweetypie/tweettext",
"tweetypie/common/src/scala/com/twitter/tweetypie/util",
"twitter-text/lib/java/src/main/java/com/twitter/twittertext",
],
)

View file

@ -0,0 +1,11 @@
package com.twitter.tweetypie.thriftscala.entities
import com.twitter.tweetypie.thriftscala.CashtagEntity
import com.twitter.tweetypie.tweettext.TextEntity
object CashtagTextEntity extends TextEntity[CashtagEntity] {
override def fromIndex(entity: CashtagEntity): Short = entity.fromIndex
override def toIndex(entity: CashtagEntity): Short = entity.toIndex
override def move(entity: CashtagEntity, fromIndex: Short, toIndex: Short): CashtagEntity =
entity.copy(fromIndex = fromIndex, toIndex = toIndex)
}

View file

@ -0,0 +1,118 @@
package com.twitter.tweetypie.thriftscala.entities
import com.twitter.servo.data.Mutation
import com.twitter.tco_util.TcoUrl
import com.twitter.tweetypie.thriftscala._
import com.twitter.tweetypie.thriftscala.entities.Implicits._
import com.twitter.tweetypie.tweettext.PartialHtmlEncoding
import com.twitter.tweetypie.tweettext.TextEntity
import com.twitter.tweetypie.tweettext.TextModification
import com.twitter.tweetypie.util.TweetLenses
import com.twitter.twittertext.Extractor
import scala.collection.JavaConverters._
/**
* Contains functions to collect urls, mentions, hashtags, and cashtags from the text of tweets and messages
*/
object EntityExtractor {
// We only use one configuration of com.twitter.twittertext.Extractor, so it's
// OK to share one global reference. The only available
// configuration option is whether to extract URLs without protocols
// (defaults to true)
private[this] val extractor = new Extractor
// The twitter-text library operates on unencoded text, but we store
// and process HTML-encoded text. The TextModification returned
// from this function contains the decoded text which we will operate on,
// but also provides us with the ability to map the indices on
// the twitter-text entities back to the entities on the encoded text.
private val htmlEncodedTextToEncodeModification: String => TextModification =
text =>
PartialHtmlEncoding
.decodeWithModification(text)
.getOrElse(TextModification.identity(text))
.inverse
private[this] val extractAllUrlsFromTextMod: TextModification => Seq[UrlEntity] =
extractUrls(false)
val extractAllUrls: String => Seq[UrlEntity] =
htmlEncodedTextToEncodeModification.andThen(extractAllUrlsFromTextMod)
private[this] val extractTcoUrls: TextModification => Seq[UrlEntity] =
extractUrls(true)
private[this] def extractUrls(tcoOnly: Boolean): TextModification => Seq[UrlEntity] =
mkEntityExtractor[UrlEntity](
extractor.extractURLsWithIndices(_).asScala.filter { e =>
if (tcoOnly) TcoUrl.isTcoUrl(e.getValue) else true
},
UrlEntity(_, _, _)
)
private[this] val extractMentionsFromTextMod: TextModification => Seq[MentionEntity] =
mkEntityExtractor[MentionEntity](
extractor.extractMentionedScreennamesWithIndices(_).asScala,
MentionEntity(_, _, _)
)
val extractMentions: String => Seq[MentionEntity] =
htmlEncodedTextToEncodeModification.andThen(extractMentionsFromTextMod)
private[this] val extractHashtagsFromTextMod: TextModification => Seq[HashtagEntity] =
mkEntityExtractor[HashtagEntity](
extractor.extractHashtagsWithIndices(_).asScala,
HashtagEntity(_, _, _)
)
val extractHashtags: String => Seq[HashtagEntity] =
htmlEncodedTextToEncodeModification.andThen(extractHashtagsFromTextMod)
private[this] val extractCashtagsFromTextMod: TextModification => Seq[CashtagEntity] =
mkEntityExtractor[CashtagEntity](
extractor.extractCashtagsWithIndices(_).asScala,
CashtagEntity(_, _, _)
)
val extractCashtags: String => Seq[CashtagEntity] =
htmlEncodedTextToEncodeModification.andThen(extractCashtagsFromTextMod)
private[this] def mkEntityExtractor[E: TextEntity](
extract: String => Seq[Extractor.Entity],
construct: (Short, Short, String) => E
): TextModification => Seq[E] =
htmlEncodedMod => {
val convert: Extractor.Entity => Option[E] =
e =>
for {
start <- asShort(e.getStart.intValue)
end <- asShort(e.getEnd.intValue)
if e.getValue != null
res <- htmlEncodedMod.reindexEntity(construct(start, end, e.getValue))
} yield res
val entities = extract(htmlEncodedMod.original)
extractor.modifyIndicesFromUTF16ToUnicode(htmlEncodedMod.original, entities.asJava)
entities.map(convert).flatten
}
private[this] def asShort(i: Int): Option[Short] =
if (i.isValidShort) Some(i.toShort) else None
private[this] def mutation(extractUrls: Boolean): Mutation[Tweet] =
Mutation { tweet =>
val htmlEncodedMod = htmlEncodedTextToEncodeModification(TweetLenses.text.get(tweet))
Some(
tweet.copy(
urls = if (extractUrls) Some(extractTcoUrls(htmlEncodedMod)) else tweet.urls,
mentions = Some(extractMentionsFromTextMod(htmlEncodedMod)),
hashtags = Some(extractHashtagsFromTextMod(htmlEncodedMod)),
cashtags = Some(extractCashtagsFromTextMod(htmlEncodedMod))
)
)
}
val mutationWithoutUrls: Mutation[Tweet] = mutation(false)
val mutationAll: Mutation[Tweet] = mutation(true)
}

View file

@ -0,0 +1,11 @@
package com.twitter.tweetypie.thriftscala.entities
import com.twitter.tweetypie.thriftscala.HashtagEntity
import com.twitter.tweetypie.tweettext.TextEntity
object HashtagTextEntity extends TextEntity[HashtagEntity] {
override def fromIndex(entity: HashtagEntity): Short = entity.fromIndex
override def toIndex(entity: HashtagEntity): Short = entity.toIndex
override def move(entity: HashtagEntity, fromIndex: Short, toIndex: Short): HashtagEntity =
entity.copy(fromIndex = fromIndex, toIndex = toIndex)
}

View file

@ -0,0 +1,10 @@
package com.twitter.tweetypie.thriftscala.entities
object Implicits {
implicit val hashtagTextEntity: HashtagTextEntity.type = HashtagTextEntity
implicit val cashtagTextEntity: CashtagTextEntity.type = CashtagTextEntity
implicit val mentionTextEntity: MentionTextEntity.type = MentionTextEntity
implicit val urlTextEntity: UrlTextEntity.type = UrlTextEntity
implicit val mediaTextEntity: MediaTextEntity.type = MediaTextEntity
implicit val textRangeTextEntity: TextRangeEntityAdapter.type = TextRangeEntityAdapter
}

View file

@ -0,0 +1,11 @@
package com.twitter.tweetypie.thriftscala.entities
import com.twitter.tweetypie.thriftscala.MediaEntity
import com.twitter.tweetypie.tweettext.TextEntity
object MediaTextEntity extends TextEntity[MediaEntity] {
override def fromIndex(entity: MediaEntity): Short = entity.fromIndex
override def toIndex(entity: MediaEntity): Short = entity.toIndex
override def move(entity: MediaEntity, fromIndex: Short, toIndex: Short): MediaEntity =
entity.copy(fromIndex = fromIndex, toIndex = toIndex)
}

View file

@ -0,0 +1,11 @@
package com.twitter.tweetypie.thriftscala.entities
import com.twitter.tweetypie.thriftscala.MentionEntity
import com.twitter.tweetypie.tweettext.TextEntity
object MentionTextEntity extends TextEntity[MentionEntity] {
override def fromIndex(entity: MentionEntity): Short = entity.fromIndex
override def toIndex(entity: MentionEntity): Short = entity.toIndex
override def move(entity: MentionEntity, fromIndex: Short, toIndex: Short): MentionEntity =
entity.copy(fromIndex = fromIndex, toIndex = toIndex)
}

View file

@ -0,0 +1,11 @@
package com.twitter.tweetypie.thriftscala.entities
import com.twitter.tweetypie.thriftscala.TextRange
import com.twitter.tweetypie.tweettext.TextEntity
object TextRangeEntityAdapter extends TextEntity[TextRange] {
override def fromIndex(entity: TextRange): Short = entity.fromIndex.toShort
override def toIndex(entity: TextRange): Short = entity.toIndex.toShort
override def move(entity: TextRange, fromIndex: Short, toIndex: Short): TextRange =
entity.copy(fromIndex = fromIndex, toIndex = toIndex)
}

View file

@ -0,0 +1,11 @@
package com.twitter.tweetypie.thriftscala.entities
import com.twitter.tweetypie.thriftscala.UrlEntity
import com.twitter.tweetypie.tweettext.TextEntity
object UrlTextEntity extends TextEntity[UrlEntity] {
override def fromIndex(entity: UrlEntity): Short = entity.fromIndex
override def toIndex(entity: UrlEntity): Short = entity.toIndex
override def move(entity: UrlEntity, fromIndex: Short, toIndex: Short): UrlEntity =
entity.copy(fromIndex = fromIndex, toIndex = toIndex)
}

View file

@ -0,0 +1,16 @@
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
provides = scala_artifact(
org = "com.twitter",
name = "tweetypie-tweettext",
repo = artifactory,
),
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/ibm/icu:icu4j",
"twitter-text/lib/java/src/main/java/com/twitter/twittertext",
],
)

View file

@ -0,0 +1,44 @@
package com.twitter.tweetypie.tweettext
import com.ibm.icu.text.BreakIterator
/**
* Adapt the [[BreakIterator]] interface to a scala [[Iterator]]
* over the offsets of user-perceived characters in a String.
*/
object GraphemeIndexIterator {
/**
* Produce an iterator over indices in the string that mark the end
* of a user-perceived character (grapheme)
*/
def ends(s: String): Iterator[Offset.CodeUnit] =
// The start of every grapheme but the first is also a grapheme
// end. The last grapheme ends at the end of the string.
starts(s).drop(1) ++ Iterator(Offset.CodeUnit.length(s))
/**
* Produce an iterator over indices in the string that mark the start
* of a user-perceived character (grapheme)
*/
def starts(s: String): Iterator[Offset.CodeUnit] =
new Iterator[Offset.CodeUnit] {
private[this] val it = BreakIterator.getCharacterInstance()
it.setText(s)
override def hasNext: Boolean = it.current < s.length
override def next: Offset.CodeUnit = {
if (!hasNext) throw new IllegalArgumentException(s"${it.current()}, ${s.length}")
// No matter what, we will be returning the value of `current`,
// which is the index of the start of the next grapheme.
val result = it.current()
it.next()
Offset.CodeUnit(result)
}
}
}

View file

@ -0,0 +1,85 @@
package com.twitter.tweetypie.tweettext
/**
* An efficient converter of indices between code points and code units.
*/
class IndexConverter(text: String) {
// Keep track of a single corresponding pair of code unit and code point
// offsets so that we can re-use counting work if the next requested
// entity is near the most recent entity.
private var codePointIndex = 0
// The code unit index should never split a surrogate pair.
private var charIndex = 0
/**
* @param offset Index into the string measured in code units.
* @return The code point index that corresponds to the specified character index.
*/
def toCodePoints(offset: Offset.CodeUnit): Offset.CodePoint =
Offset.CodePoint(codeUnitsToCodePoints(offset.toInt))
/**
* @param charIndex Index into the string measured in code units.
* @return The code point index that corresponds to the specified character index.
*/
def codeUnitsToCodePoints(charIndex: Int): Int = {
if (charIndex < this.charIndex) {
this.codePointIndex -= text.codePointCount(charIndex, this.charIndex)
} else {
this.codePointIndex += text.codePointCount(this.charIndex, charIndex)
}
this.charIndex = charIndex
// Make sure that charIndex never points to the second code unit of a
// surrogate pair.
if (charIndex > 0 && Character.isSupplementaryCodePoint(text.codePointAt(charIndex - 1))) {
this.charIndex -= 1
this.codePointIndex -= 1
}
this.codePointIndex
}
/**
* @param offset Index into the string measured in code points.
* @return the corresponding code unit index
*/
def toCodeUnits(offset: Offset.CodePoint): Offset.CodeUnit = {
this.charIndex = text.offsetByCodePoints(charIndex, offset.toInt - this.codePointIndex)
this.codePointIndex = offset.toInt
Offset.CodeUnit(this.charIndex)
}
/**
* @param codePointIndex Index into the string measured in code points.
* @return the corresponding code unit index
*/
def codePointsToCodeUnits(codePointIndex: Int): Int =
toCodeUnits(Offset.CodePoint(codePointIndex)).toInt
/**
* Returns a substring which begins at the specified code point `from` and extends to the
* code point `to`. Since String.substring only works with character, the method first
* converts code point offset to code unit offset.
*/
def substring(from: Offset.CodePoint, to: Offset.CodePoint): String =
text.substring(toCodeUnits(from).toInt, toCodeUnits(to).toInt)
/**
* Returns a substring which begins at the specified code point `from` and extends to the
* code point `to`. Since String.substring only works with character, the method first
* converts code point offset to code unit offset.
*/
def substringByCodePoints(from: Int, to: Int): String =
substring(Offset.CodePoint(from), Offset.CodePoint(to))
/**
* Returns a substring which begins at the specified code point `from` and extends to the
* end of the string. Since String.substring only works with character, the method first
* converts code point offset to code unit offset.
*/
def substringByCodePoints(from: Int): String = {
val charFrom = codePointsToCodeUnits(from)
text.substring(charFrom)
}
}

View file

@ -0,0 +1,253 @@
package com.twitter.tweetypie.tweettext
import scala.collection.immutable
/**
* An Offset is a typed index into a String.
*/
trait Offset[T] extends Ordering[T] {
def toInt(t: T): Int
def count(text: String, start: Offset.CodeUnit, end: Offset.CodeUnit): T
def compare(t1: T, t2: T): Int = toInt(t1).compare(toInt(t2))
def length(input: String): T = count(input, Offset.CodeUnit(0), Offset.CodeUnit.length(input))
}
object Offset {
/**
* UTF-16 code unit offsets are the native offsets for Java/Scala
* Strings.
*/
case class CodeUnit(toInt: Int) extends AnyVal with Ordered[CodeUnit] {
def compare(other: CodeUnit): Int = toInt.compare(other.toInt)
def +(other: CodeUnit) = CodeUnit(toInt + other.toInt)
def -(other: CodeUnit) = CodeUnit(toInt - other.toInt)
def min(other: CodeUnit): CodeUnit = if (toInt < other.toInt) this else other
def max(other: CodeUnit): CodeUnit = if (toInt > other.toInt) this else other
def incr: CodeUnit = CodeUnit(toInt + 1)
def decr: CodeUnit = CodeUnit(toInt - 1)
def until(end: CodeUnit): immutable.IndexedSeq[CodeUnit] =
toInt.until(end.toInt).map(CodeUnit(_))
/**
* Converts this `CodeUnit` to the equivalent `CodePoint` within the
* given text.
*/
def toCodePoint(text: String): CodePoint =
CodePoint(text.codePointCount(0, toInt))
def offsetByCodePoints(text: String, codePoints: CodePoint): CodeUnit =
CodeUnit(text.offsetByCodePoints(toInt, codePoints.toInt))
}
implicit object CodeUnit extends Offset[CodeUnit] {
def toInt(u: CodeUnit): Int = u.toInt
override def length(text: String): CodeUnit = CodeUnit(text.length)
def count(text: String, start: CodeUnit, end: CodeUnit): CodeUnit = end - start
}
/**
* Offsets in whole Unicode code points. Any CodePoint is a valid
* offset into the String as long as it is >= 0 and less than the
* number of code points in the string.
*/
case class CodePoint(toInt: Int) extends AnyVal with Ordered[CodePoint] {
def toShort: Short = toInt.toShort
def compare(other: CodePoint): Int = toInt.compare(other.toInt)
def +(other: CodePoint) = CodePoint(toInt + other.toInt)
def -(other: CodePoint) = CodePoint(toInt - other.toInt)
def min(other: CodePoint): CodePoint = if (toInt < other.toInt) this else other
def max(other: CodePoint): CodePoint = if (toInt > other.toInt) this else other
def until(end: CodePoint): immutable.IndexedSeq[CodePoint] =
toInt.until(end.toInt).map(CodePoint(_))
def toCodeUnit(text: String): CodeUnit =
CodeUnit(text.offsetByCodePoints(0, toInt))
}
implicit object CodePoint extends Offset[CodePoint] {
def toInt(p: CodePoint): Int = p.toInt
def count(text: String, start: CodeUnit, end: CodeUnit): CodePoint =
CodePoint(text.codePointCount(start.toInt, end.toInt))
}
/**
* Offsets into the String as if the String were encoded as UTF-8. You
* cannot use a [[Utf8]] offset to index a String, because not all
* Utf8 indices are valid indices into the String.
*/
case class Utf8(toInt: Int) extends AnyVal with Ordered[Utf8] {
def compare(other: Utf8): Int = toInt.compare(other.toInt)
def +(other: Utf8) = Utf8(toInt + other.toInt)
def -(other: Utf8) = Utf8(toInt - other.toInt)
def min(other: Utf8): Utf8 = if (toInt < other.toInt) this else other
def max(other: Utf8): Utf8 = if (toInt > other.toInt) this else other
}
implicit object Utf8 extends Offset[Utf8] {
def toInt(u: Utf8): Int = u.toInt
/**
* Count how many bytes this section of text would be when encoded as
* UTF-8.
*/
def count(s: String, start: CodeUnit, end: CodeUnit): Utf8 = {
def go(i: CodeUnit, byteLength: Utf8): Utf8 =
if (i < end) {
val cp = s.codePointAt(i.toInt)
go(i + CodeUnit(Character.charCount(cp)), byteLength + forCodePoint(cp))
} else {
byteLength
}
go(start, Utf8(0))
}
/**
* Unfortunately, there is no convenient API for finding out how many
* bytes a unicode code point would take in UTF-8, so we have to
* explicitly calculate it.
*
* @see http://en.wikipedia.org/wiki/UTF-8#Description
*/
def forCodePoint(cp: Int): Utf8 =
Utf8 {
// if the code point is an unpaired surrogate, it will be converted
// into a 1 byte replacement character
if (Character.getType(cp) == Character.SURROGATE) 1
else {
cp match {
case _ if cp < 0x80 => 1
case _ if cp < 0x800 => 2
case _ if cp < 0x10000 => 3
case _ => 4
}
}
}
}
/**
* Display units count what we consider a "character" in a
* Tweet. [[DisplayUnit]] offsets are only valid for text that is
* NFC-normalized (See: http://www.unicode.org/reports/tr15) and
* HTML-encoded, though this interface cannot enforce that.
*
* Currently, a [[DisplayUnit]] is equivalent to a single Unicode code
* point combined with treating "&lt;", "&gt;", and "&amp;" each as a
* single character (since they are displayed as '<', '>', and '&'
* respectively). This implementation is not directly exposed.
*
* It should be possible to change this definition without breaking
* code that uses the [[DisplayUnit]] interface e.g. to count
* user-perceived characters (graphemes) rather than code points,
* though any change has to be made in concert with changing the
* mobile client and Web implementations so that the user experience
* of character counting remains consistent.
*/
case class DisplayUnit(toInt: Int) extends AnyVal with Ordered[DisplayUnit] {
def compare(other: DisplayUnit): Int = toInt.compare(other.toInt)
def +(other: DisplayUnit) = DisplayUnit(toInt + other.toInt)
def -(other: DisplayUnit) = DisplayUnit(toInt - other.toInt)
def min(other: DisplayUnit): DisplayUnit = if (toInt < other.toInt) this else other
def max(other: DisplayUnit): DisplayUnit = if (toInt > other.toInt) this else other
}
implicit object DisplayUnit extends Offset[DisplayUnit] {
def toInt(d: DisplayUnit): Int = d.toInt
/**
* Returns the number of display units in the specified range of the
* given text. See [[DisplayUnit]] for a descrption of what we
* consider a display unit.
*
* The input string should already be NFC normalized to get
* consistent results. If partially html encoded, it will correctly
* count html entities as a single display unit.
*
* @param text the string containing the characters to count.
* @param the index to the first char of the text range
* @param the index after the last char of the text range.
*/
def count(text: String, start: CodeUnit, end: CodeUnit): DisplayUnit = {
val stop = end.min(CodeUnit.length(text))
@annotation.tailrec
def go(offset: CodeUnit, total: DisplayUnit): DisplayUnit =
if (offset >= stop) total
else go(offset + at(text, offset), total + DisplayUnit(1))
go(start, DisplayUnit(0))
}
/**
* Return the length of the display unit at the specified offset in
* the (NFC-normalized, HTML-encoded) text.
*/
def at(text: String, offset: CodeUnit): CodeUnit =
CodeUnit {
text.codePointAt(offset.toInt) match {
case '&' =>
if (text.regionMatches(offset.toInt, "&amp;", 0, 5)) 5
else if (text.regionMatches(offset.toInt, "&lt;", 0, 4)) 4
else if (text.regionMatches(offset.toInt, "&gt;", 0, 4)) 4
else 1
case cp => Character.charCount(cp)
}
}
}
/**
* Ranges of offsets, useful for avoiding slicing entities.
*/
sealed trait Ranges[T] {
def contains(t: T): Boolean
}
object Ranges {
private[this] case class Impl[T](toSeq: Seq[(T, T)])(implicit off: Offset[T])
extends Ranges[T] {
def contains(t: T): Boolean = toSeq.exists { case (lo, hi) => off.gt(t, lo) && off.lt(t, hi) }
}
/**
* Non-inclusive range of offsets (matches values that are strictly
* between `hi` and `lo`)
*/
def between[T](lo: T, hi: T)(implicit off: Offset[T]): Ranges[T] =
if (off.toInt(hi) > off.toInt(lo) + 1 && off.toInt(lo) < Int.MaxValue) Impl(Seq((lo, hi)))
else Impl(Nil)
/**
* The union of all of the specified ranges.
*/
def all[T](ranges: Seq[Ranges[T]])(implicit off: Offset[T]): Ranges[T] =
Impl(
// Preprocess the ranges so that each contains check is as cheap
// as possible.
ranges
.flatMap { case r: Impl[T] => r.toSeq }
.sortBy(_._1)
.foldLeft(Nil: List[(T, T)]) {
case ((a, b) :: out, (c, d)) if off.lt(c, b) => (a, d) :: out
case (out, r) => r :: out
}
)
def Empty[T: Offset]: Ranges[T] = Impl[T](Nil)
private[this] val HtmlEscapes = """&(?:amp|lt|gt);""".r
/**
* Match [[CodeUnit]]s that would split a HTML entity.
*/
def htmlEntities(s: String): Ranges[CodeUnit] = {
val it = HtmlEscapes.findAllIn(s)
all(it.map(_ => between(CodeUnit(it.start), CodeUnit(it.end))).toSeq)
}
def fromCodePointPairs(pairs: Seq[(Int, Int)]): Ranges[CodePoint] =
all(pairs.map { case (lo, hi) => between(CodePoint(lo), CodePoint(hi)) })
}
}

View file

@ -0,0 +1,55 @@
package com.twitter.tweetypie.tweettext
/**
* Code used to convert raw user-provided text into an allowable form.
*/
object PartialHtmlEncoding {
/**
* Replaces all `<`, `>`, and '&' chars with "&lt;", "&gt;", and "&amp;", respectively.
*
* Tweet text is HTML-encoded at tweet creation time, and is stored and processed in encoded form.
*/
def encode(text: String): String = {
val buf = new StringBuilder
text.foreach {
case '<' => buf.append("&lt;")
case '>' => buf.append("&gt;")
case '&' => buf.append("&amp;")
case c => buf.append(c)
}
buf.toString
}
private val AmpLtRegex = "&lt;".r
private val AmpGtRegex = "&gt;".r
private val AmpAmpRegex = "&amp;".r
private val partialHtmlDecoder: (String => String) =
((s: String) => AmpLtRegex.replaceAllIn(s, "<"))
.andThen(s => AmpGtRegex.replaceAllIn(s, ">"))
.andThen(s => AmpAmpRegex.replaceAllIn(s, "&"))
/**
* The opposite of encode, it replaces all "&lt;", "&gt;", and "&amp;" with
* `<`, `>`, and '&', respectively.
*/
def decode(text: String): String =
decodeWithModification(text) match {
case Some(mod) => mod.updated
case None => text
}
/**
* Decodes encoded entities, and returns a `TextModification` if the text was modified.
*/
def decodeWithModification(text: String): Option[TextModification] =
TextModification.replaceAll(
text,
AmpLtRegex -> "<",
AmpGtRegex -> ">",
AmpAmpRegex -> "&"
)
}

View file

@ -0,0 +1,251 @@
package com.twitter.tweetypie.tweettext
import scala.util.matching.Regex
/**
* Code used to convert raw user-provided text into an allowable form.
*/
object Preprocessor {
import TweetText._
import TextModification.replaceAll
/**
* Regex for dos-style line endings.
*/
val DosLineEndingRegex: Regex = """\r\n""".r
/**
* Converts \r\n to just \n.
*/
def normalizeNewlines(text: String): String =
DosLineEndingRegex.replaceAllIn(text, "\n")
/**
* Characters to strip out of tweet text at write-time.
*/
val unicodeCharsToStrip: Seq[Char] =
Seq(
'\uFFFE', '\uFEFF', // BOM
'\uFFFF', // Special
'\u200E', '\u200F', // ltr, rtl
'\u202A', '\u202B', '\u202C', '\u202D', '\u202E', // Directional change
'\u0000', '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008',
'\u0009', '\u000B', '\u000C', '\u000E', '\u000F', '\u0010', '\u0011', '\u0012', '\u0013',
'\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001A', '\u001B', '\u001C',
'\u001D', '\u001E', '\u001F', '\u007F',
'\u2065',
)
val UnicodeCharsToStripRegex: Regex = unicodeCharsToStrip.mkString("[", "", "]").r
/**
* Strips out control characters and other non-textual unicode chars that can break xml and/or
* json rendering, or be used for exploits.
*/
def stripControlCharacters(text: String): String =
UnicodeCharsToStripRegex.replaceAllIn(text, "")
val Tweetypie674UnicodeSequence: String =
"\u0633\u0645\u064e\u0640\u064e\u0651\u0648\u064f\u0648\u064f\u062d\u062e " +
"\u0337\u0334\u0310\u062e \u0337\u0334\u0310\u062e \u0337\u0334\u0310\u062e " +
"\u0627\u0645\u0627\u0631\u062a\u064a\u062e \u0337\u0334\u0310\u062e"
val Tweetypie674UnicodeRegex: Regex = Tweetypie674UnicodeSequence.r
/**
* Replace each `Tweetypie674UnicodeSequence` of this string to REPLACEMENT
* CHARACTER.
*
* Apple has a bug in its CoreText library. This aims to prevent
* ios clients from being crashed when a tweet contains the specific
* unicode sequence.
*/
def avoidCoreTextBug(text: String): String =
Tweetypie674UnicodeRegex.replaceAllIn(text, "\ufffd")
/**
* Replace each `Tweetypie674UnicodeSequence` of this string to a REPLACEMENT
* CHARACTER, returns a TextModification object that provides information
* to also update entity indices.
*/
def replaceCoreTextBugModification(text: String): Option[TextModification] =
replaceAll(text, Tweetypie674UnicodeRegex, "\ufffd")
private val preprocessor: String => String =
((s: String) => nfcNormalize(s))
.andThen(stripControlCharacters _)
.andThen(trimBlankCharacters _)
.andThen(normalizeNewlines _)
.andThen(collapseBlankLines _)
.andThen(avoidCoreTextBug _)
/**
* Performs the text modifications that are necessary in the write-path before extracting URLs.
*/
def preprocessText(text: String): String =
preprocessor(text)
/**
* Replaces all `<`, `>`, and '&' chars with "&lt;", "&gt;", and "&amp;", respectively.
*
* The original purpose of this was presumably to prevent script injections when
* displaying tweets without proper escaping. Currently, tweets are encoded before
* they are stored in the database.
*
* Note that the pre-escaping of & < and > also happens in the rich text editor in javascript
*/
def partialHtmlEncode(text: String): String =
PartialHtmlEncoding.encode(text)
/**
* The opposite of partialHtmlEncode, it replaces all "&lt;", "&gt;", and "&amp;" with
* `<`, `>`, and '&', respectively.
*/
def partialHtmlDecode(text: String): String =
PartialHtmlEncoding.decode(text)
/**
*
* Detects all forms of whitespace, considering as whitespace the following:
* This regex detects characters that always or often are rendered as blank space. We use
* this to prevent users from inserting excess blank lines and from tweeting effectively
* blank tweets.
*
* Note that these are not all semantically "whitespace", so this regex should not be used
* to process non-blank text, e.g. to separate words.
*
* Codepoints below and the `\p{Z}` regex character property alias are defined in the Unicode
* Character Database (UCD) at https://unicode.org/ucd/ and https://unicode.org/reports/tr44/
*
* The `\p{Z}` regex character property alias is defined specifically in UCD as:
*
* Zs | Space_Separator | a space character (of various non-zero widths)
* Zl | Line_Separator | U+2028 LINE SEPARATOR only
* Zp | Paragraph_Separator | U+2029 PARAGRAPH SEPARATOR only
* Z | Separator | Zs | Zl | Zp
* ref: https://unicode.org/reports/tr44/#GC_Values_Table
*
* U+0009 Horizontal Tab (included in \s)
* U+000B Vertical Tab (included in \s)
* U+000C Form feed (included in \s)
* U+000D Carriage return (included in \s)
* U+0020 space (included in \s)
* U+0085 Next line (included in \u0085)
* U+061C arabic letter mark (included in \u061C)
* U+00A0 no-break space (included in \p{Z})
* U+00AD soft-hyphen marker (included in \u00AD)
* U+1680 ogham space mark (included in \p{Z})
* U+180E mongolian vowel separator (included in \p{Z} on jdk8 and included in \u180E on jdk11)
* U+2000 en quad (included in \p{Z})
* U+2001 em quad (included in \p{Z})
* U+2002 en space (included in \p{Z})
* U+2003 em space (included in \p{Z})
* U+2004 three-per-em space (included in \p{Z})
* U+2005 four-per-em space (included in \p{Z})
* U+2006 six-per-em space (included in \p{Z})
* U+2007 figure space (included in \p{Z})
* U+2008 punctuation space (included in \p{Z})
* U+2009 thin space (included in \p{Z})
* U+200A hair space (included in \p{Z})
* U+200B zero-width (included in \u200B-\u200D)
* U+200C zero-width non-joiner (included in \u200B-\u200D)
* U+200D zero-width joiner (included in \u200B-\u200D)
* U+2028 line separator (included in \p{Z})
* U+2029 paragraph separator (included in \p{Z})
* U+202F narrow no-break space (included in \p{Z})
* U+205F medium mathematical space (included in \p{Z})
* U+2061 function application (included in \u2061-\u2064)
* U+2062 invisible times (included in \u2061-\u2064)
* U+2063 invisible separator (included in \u2061-\u2064)
* U+2064 invisible plus (included in \u2061-\u2064)
* U+2066 left-to-right isolate (included in \u2066-\u2069)
* U+2067 right-to-left isolate (included in \u2066-\u2069)
* U+2068 first strong isolate (included in \u2066-\u2069)
* U+2069 pop directional isolate (included in \u2066-\u2069)
* U+206A inhibit symmetric swapping (included in \u206A-\u206F)
* U+206B activate symmetric swapping (included in \u206A-\u206F)
* U+206C inhibit arabic form shaping (included in \u206A-\u206F)
* U+206D activate arabic form shaping (included in \u206A-\u206F)
* U+206E national digit shapes (included in \u206A-\u206F)
* U+206F nominal digit shapes (included in \u206A-\u206F)
* U+2800 braille pattern blank (included in \u2800)
* U+3164 hongul filler (see UCD Ignorable_Code_Point)
* U+FFA0 halfwidth hongul filler (see UCD Ignorable_Code_Point)
* U+3000 ideographic space (included in \p{Z})
* U+FEFF zero-width no-break space (explicitly included in \uFEFF)
*/
val BlankTextRegex: Regex =
"""[\s\p{Z}\u180E\u0085\u00AD\u061C\u200B-\u200D\u2061-\u2064\u2066-\u2069\u206A-\u206F\u2800\u3164\uFEFF\uFFA0]*""".r
/**
* Some of the above blank characters are valid at the start of a Tweet (and irrelevant at the end)
* such as characters that change the direction of text. When trimming from the start
* or end of text we use a smaller set of characters
*/
val BlankWhenLeadingOrTrailingRegex: Regex = """[\s\p{Z}\u180E\u0085\u200B\uFEFF]*""".r
/**
* Matches consecutive blanks, starting at a newline.
*/
val ConsecutiveBlankLinesRegex: Regex = ("""\n(""" + BlankTextRegex + """\n){2,}""").r
val LeadingBlankCharactersRegex: Regex = ("^" + BlankWhenLeadingOrTrailingRegex).r
val TrailingBlankCharactersRegex: Regex = (BlankWhenLeadingOrTrailingRegex + "$").r
/**
* Is the given text empty or contains nothing but whitespace?
*/
def isBlank(text: String): Boolean =
BlankTextRegex.pattern.matcher(text).matches()
/**
* See http://confluence.local.twitter.com/display/PROD/Displaying+line+breaks+in+Tweets
*
* Collapses consecutive blanks lines down to a single blank line. We can assume that
* all newlines have already been normalized to just \n, so we don't have to worry about
* \r\n.
*/
def collapseBlankLinesModification(text: String): Option[TextModification] =
replaceAll(text, ConsecutiveBlankLinesRegex, "\n\n")
def collapseBlankLines(text: String): String =
ConsecutiveBlankLinesRegex.replaceAllIn(text, "\n\n")
def trimBlankCharacters(text: String): String =
TrailingBlankCharactersRegex.replaceFirstIn(
LeadingBlankCharactersRegex.replaceFirstIn(text, ""),
""
)
/** Characters that are not visible on their own. Some of these are used in combination with
* other visible characters, and therefore cannot be always stripped from tweets.
*/
private[tweettext] val InvisibleCharacters: Seq[Char] =
Seq(
'\u2060', '\u2061', '\u2062', '\u2063', '\u2064', '\u206A', '\u206B', '\u206C', '\u206D',
'\u206D', '\u206E', '\u206F', '\u200C',
'\u200D', // non-printing chars with valid use in Arabic
'\u2009', '\u200A', '\u200B', // include very skinny spaces too
'\ufe00', '\ufe01', '\ufe02', '\ufe03', '\ufe04', '\ufe05', '\ufe06', '\ufe07', '\ufe08',
'\ufe09', '\ufe0A', '\ufe0B', '\ufe0C', '\ufe0D', '\ufe0E', '\ufe0F',
)
private[tweetypie] val InvisibleUnicodePattern: Regex =
("^[" + InvisibleCharacters.mkString + "]+$").r
def isInvisibleChar(input: Char): Boolean = {
InvisibleCharacters contains input
}
/** If string is only "invisible characters", replace full string with whitespace.
* The purpose of this method is to remove invisible characters when ONLY invisible characters
* appear between two urls, which can be a security vulnerability due to misleading behavior. These
* characters cannot be removed as a rule applied to the tweet, because they are used in
* conjuction with other characters.
*/
def replaceInvisiblesWithWhitespace(text: String): String = {
text match {
case invisible @ InvisibleUnicodePattern() => " " * TweetText.codePointLength(invisible)
case other => other
}
}
}

View file

@ -0,0 +1,24 @@
package com.twitter.tweetypie.tweettext
/**
* A type class for entities found within a piece of tweet text.
*/
trait TextEntity[T] {
def fromIndex(entity: T): Short
def toIndex(entity: T): Short
def move(entity: T, fromIndex: Short, toIndex: Short): T
}
object TextEntity {
def fromIndex[T: TextEntity](entity: T): Short =
implicitly[TextEntity[T]].fromIndex(entity)
def toIndex[T: TextEntity](entity: T): Short =
implicitly[TextEntity[T]].toIndex(entity)
def move[T: TextEntity](entity: T, fromIndex: Short, toIndex: Short): T =
implicitly[TextEntity[T]].move(entity, fromIndex, toIndex)
def shift[T: TextEntity](entity: T, offset: Short): T =
move(entity, (fromIndex(entity) + offset).toShort, (toIndex(entity) + offset).toShort)
}

View file

@ -0,0 +1,232 @@
package com.twitter.tweetypie.tweettext
import scala.util.matching.Regex
object TextModification {
/**
* Lift a text into a TextModification where `original` and `updated` text are the same
* and `replacements` is empty.
*/
def identity(text: String): TextModification =
TextModification(original = text, updated = text, replacements = Nil)
/**
* Replace each substring that matches the regex with the substitution string, returns a
* TextModification object that contains the updated text and enough information to also
* update entity indices.
*
* This method should correctly be taking into account surrogate-pairs. The returned
* TextModification object has code-point offsets, instead of code-unit offsets.
*/
def replaceAll(text: String, regex: Regex, substitution: String): Option[TextModification] =
replaceAll(text, regex -> substitution)
/**
* Replaces substrings that match the given `Regex` with the corresonding substitution
* string. Returns a `TextModification` that can be used to reindex entities.
*/
def replaceAll(
text: String,
regexAndSubstitutions: (Regex, String)*
): Option[TextModification] = {
val matches =
(for {
(r, s) <- regexAndSubstitutions
m <- r.findAllIn(text).matchData
} yield (m, s)).sortBy { case (m, _) => m.start }
if (matches.isEmpty) {
// no match found, return None to indicate no modifications made
None
} else {
val replacements = List.newBuilder[TextReplacement]
val indexConverter = new IndexConverter(text)
// contains the retained text, built up as we walk through the regex matches
val buf = new StringBuilder(text.length)
// the number of code-points copied into buf
var codePointsCopied = Offset.CodePoint(0)
// always holds the start code-unit offset to copy to buf when we encounter
// either a regex match or end-of-string.
var anchor = 0
import indexConverter.toCodePoints
for ((m, sub) <- matches) {
val unchangedText = text.substring(anchor, m.start)
val unchangedLen = Offset.CodePoint.length(unchangedText)
val subLen = Offset.CodePoint.length(sub)
// copies the text upto the regex match run, plus the replacement string
buf.append(unchangedText).append(sub)
codePointsCopied += unchangedLen + subLen
// the offsets indicate the indices of the matched string in the original
// text, and the indices of the replacement string in the updated string
replacements +=
TextReplacement(
originalFrom = toCodePoints(Offset.CodeUnit(m.start)),
originalTo = toCodePoints(Offset.CodeUnit(m.end)),
updatedFrom = codePointsCopied - subLen,
updatedTo = codePointsCopied
)
anchor = m.end
}
buf.append(text.substring(anchor))
Some(TextModification(text, buf.toString, replacements.result()))
}
}
/**
* Inserts a string at a specified code point offset.
* Returns a `TextModification` that can be used to reindex entities.
*/
def insertAt(
originalText: String,
insertAt: Offset.CodePoint,
textToInsert: String
): TextModification = {
val insertAtCodeUnit = insertAt.toCodeUnit(originalText).toInt
val (before, after) = originalText.splitAt(insertAtCodeUnit)
val updatedText = s"$before$textToInsert$after"
val textToInsertLength = TweetText.codePointLength(textToInsert)
TextModification(
original = originalText,
updated = updatedText,
replacements = List(
TextReplacement.fromCodePoints(
originalFrom = insertAt.toInt,
originalTo = insertAt.toInt,
updatedFrom = insertAt.toInt,
updatedTo = insertAt.toInt + textToInsertLength
))
)
}
}
/**
* Encodes information about insertions/deletions/replacements made to a string, providing
* the original string, the updated string, and a list of TextReplacement objects
* that encode the indices of the segments that were changed. Using this information,
* it is possible to map an offset into the original string to an offset into the updated
* string, assuming the text at the offset was not within one of the modified segments.
*
* All offsets are code-points, not UTF6 code-units.
*/
case class TextModification(
original: String,
updated: String,
replacements: List[TextReplacement]) {
private val originalLen = Offset.CodePoint.length(original)
/**
* Using an offset into the original String, computes the equivalent offset into the updated
* string. If the offset falls within a segment that was removed/replaced, None is returned.
*/
def reindex(index: Offset.CodePoint): Option[Offset.CodePoint] =
reindex(index, Offset.CodePoint(0), replacements)
/**
* Reindexes an entity of type T. Returns the updated entity, or None if either the `fromIndex`
* or `toIndex` value is now out of range.
*/
def reindexEntity[T: TextEntity](e: T): Option[T] =
for {
from <- reindex(Offset.CodePoint(TextEntity.fromIndex(e)))
to <- reindex(Offset.CodePoint(TextEntity.toIndex(e) - 1))
} yield TextEntity.move(e, from.toShort, (to.toShort + 1).toShort)
/**
* Reindexes a sequence of entities of type T. Some entities could be filtered
* out if they span a region of text that has been removed.
*/
def reindexEntities[T: TextEntity](es: Seq[T]): Seq[T] =
for (e <- es; e2 <- reindexEntity(e)) yield e2
/**
* Swaps `original` and `updated` text and inverts all `TextReplacement` instances.
*/
def inverse: TextModification =
TextModification(updated, original, replacements.map(_.inverse))
// recursively walks through the list of TextReplacement objects computing
// offsets to add/substract from 'shift', which accumulates all changes and
// then gets added to index at the end.
private def reindex(
index: Offset.CodePoint,
shift: Offset.CodePoint,
reps: List[TextReplacement]
): Option[Offset.CodePoint] =
reps match {
case Nil =>
if (index.toInt >= 0 && index <= originalLen)
Some(index + shift)
else
None
case (r @ TextReplacement(fr, to, _, _)) :: tail =>
if (index < fr) Some(index + shift)
else if (index < to) None
else reindex(index, shift + r.lengthDelta, tail)
}
}
object TextReplacement {
def fromCodePoints(
originalFrom: Int,
originalTo: Int,
updatedFrom: Int,
updatedTo: Int
): TextReplacement =
TextReplacement(
Offset.CodePoint(originalFrom),
Offset.CodePoint(originalTo),
Offset.CodePoint(updatedFrom),
Offset.CodePoint(updatedTo)
)
}
/**
* Encodes the indices of a segment of text in one string that maps to a replacement
* segment in an updated version of the text. The replacement segment could be empty
* (updatedTo == updatedFrom), indicating the segment was removed.
*
* All offsets are code-points, not UTF16 code-units.
*
* `originalFrom` and `updatedFrom` are inclusive.
* `originalTo` and `updatedTo` are exclusive.
*/
case class TextReplacement(
originalFrom: Offset.CodePoint,
originalTo: Offset.CodePoint,
updatedFrom: Offset.CodePoint,
updatedTo: Offset.CodePoint) {
def originalLength: Offset.CodePoint = originalTo - originalFrom
def updatedLength: Offset.CodePoint = updatedTo - updatedFrom
def lengthDelta: Offset.CodePoint = updatedLength - originalLength
def shiftOriginal(offset: Offset.CodePoint): TextReplacement =
copy(originalFrom = originalFrom + offset, originalTo = originalTo + offset)
def shiftUpdated(offset: Offset.CodePoint): TextReplacement =
copy(updatedFrom = updatedFrom + offset, updatedTo = updatedTo + offset)
def shift(offset: Offset.CodePoint): TextReplacement =
TextReplacement(
originalFrom + offset,
originalTo + offset,
updatedFrom + offset,
updatedTo + offset
)
def inverse: TextReplacement =
TextReplacement(
originalFrom = updatedFrom,
originalTo = updatedTo,
updatedFrom = originalFrom,
updatedTo = originalTo
)
}

View file

@ -0,0 +1,159 @@
package com.twitter.tweetypie.tweettext
import com.twitter.tweetypie.tweettext.TweetText._
import com.twitter.twittertext.Extractor
import java.lang.Character
import scala.annotation.tailrec
import scala.collection.JavaConverters._
object Truncator {
val Ellipsis = "\u2026"
/**
* Truncate tweet text for a retweet. If the text is longer than
* either of the length limits, code points are cut off from the end
* of the text and replaced with an ellipsis. We keep as much of the
* leading text as possible, subject to these constraints:
*
* - There are no more than `MaxDisplayLength` characters.
*
* - When converted to UTF-8, the result does not exceed `MaxByteLength`.
*
* - We do not break within a single grapheme cluster.
*
* The input is assumed to be partial HTML-encoded and may or may
* not be NFC normalized. The result will be partial HTML-encoded
* and will be NFC normalized.
*/
def truncateForRetweet(input: String): String = truncateWithEllipsis(input, Ellipsis)
/**
* Truncate to [[com.twitter.tweetypie.tweettext.TweetText#OrginalMaxDisplayLength]] display
* units, using "..." as an ellipsis. The resulting text is guaranteed to pass our tweet length
* check, but it is not guaranteed to fit in a SMS message.
*/
def truncateForSms(input: String): String = truncateWithEllipsis(input, "...")
/**
* Check the length of the given text, and truncate it if it is longer
* than the allowed length for a Tweet. The result of this method will
* always have:
*
* - Display length <= OriginalMaxDisplayLength.
* - Length when encoded as UTF-8 <= OriginalMaxUtf8Length.
*
* If the input would violate this, then the text will be
* truncated. When the text is truncated, it will be truncated such
* that:
*
* - Grapheme clusters will not be split.
* - The last character before the ellipsis will not be a whitespace
* character.
* - The ellipsis text will be appended to the end.
*/
private[this] def truncateWithEllipsis(input: String, ellipsis: String): String = {
val text = nfcNormalize(input)
val truncateAt =
truncationPoint(text, OriginalMaxDisplayLength, OriginalMaxUtf8Length, Some(ellipsis))
if (truncateAt.codeUnitOffset.toInt == text.length) text
else text.take(truncateAt.codeUnitOffset.toInt) + ellipsis
}
/**
* Indicates a potential TruncationPoint in piece of text.
*
* @param charOffset the utf-16 character offset of the truncation point
* @param codePointOffset the offset in code points
*/
case class TruncationPoint(codeUnitOffset: Offset.CodeUnit, codePointOffset: Offset.CodePoint)
/**
* Computes a TruncationPoint for the given text and length constraints. If `truncated` on
* the result is `false`, it means the text will fit within the given constraints without
* truncation. Otherwise, the result indicates both the character and code-point offsets
* at which to perform the truncation, and the resulting display length and byte length of
* the truncated string.
*
* Text should be NFC normalized first for best results.
*
* @param withEllipsis if true, then the truncation point will be computed so that there is space
* to append an ellipsis and to still remain within the limits. The ellipsis is not counted
* in the returned display and byte lengths.
*
* @param atomicUnits may contain a list of ranges that should be treated as atomic unit and
* not split. each tuple is half-open range in code points.
*/
def truncationPoint(
text: String,
maxDisplayLength: Int = OriginalMaxDisplayLength,
maxByteLength: Int = OriginalMaxUtf8Length,
withEllipsis: Option[String] = None,
atomicUnits: Offset.Ranges[Offset.CodePoint] = Offset.Ranges.Empty
): TruncationPoint = {
val breakPoints =
GraphemeIndexIterator
.ends(text)
.filterNot(Offset.Ranges.htmlEntities(text).contains)
val ellipsisDisplayUnits =
withEllipsis.map(Offset.DisplayUnit.length).getOrElse(Offset.DisplayUnit(0))
val maxTruncatedDisplayLength = Offset.DisplayUnit(maxDisplayLength) - ellipsisDisplayUnits
val ellipsisByteLength = withEllipsis.map(Offset.Utf8.length).getOrElse(Offset.Utf8(0))
val maxTruncatedByteLength = Offset.Utf8(maxByteLength) - ellipsisByteLength
var codeUnit = Offset.CodeUnit(0)
var codePoint = Offset.CodePoint(0)
var displayLength = Offset.DisplayUnit(0)
var byteLength = Offset.Utf8(0)
var truncateCodeUnit = codeUnit
var truncateCodePoint = codePoint
@tailrec def go(): TruncationPoint =
if (displayLength.toInt > maxDisplayLength || byteLength.toInt > maxByteLength) {
TruncationPoint(truncateCodeUnit, truncateCodePoint)
} else if (codeUnit != truncateCodeUnit &&
displayLength <= maxTruncatedDisplayLength &&
byteLength <= maxTruncatedByteLength &&
(codeUnit.toInt == 0 || !Character.isWhitespace(text.codePointBefore(codeUnit.toInt))) &&
!atomicUnits.contains(codePoint)) {
// we can advance the truncation point
truncateCodeUnit = codeUnit
truncateCodePoint = codePoint
go()
} else if (breakPoints.hasNext) {
// there are further truncation points to consider
val nextCodeUnit = breakPoints.next
codePoint += Offset.CodePoint.count(text, codeUnit, nextCodeUnit)
displayLength += Offset.DisplayUnit.count(text, codeUnit, nextCodeUnit)
byteLength += Offset.Utf8.count(text, codeUnit, nextCodeUnit)
codeUnit = nextCodeUnit
go()
} else {
TruncationPoint(codeUnit, codePoint)
}
go()
}
/**
* Truncate the given text, avoiding chopping HTML entities and tweet
* entities. This should only be used for testing because it performs
* entity extraction, and so is very inefficient.
*/
def truncateForTests(
input: String,
maxDisplayLength: Int = OriginalMaxDisplayLength,
maxByteLength: Int = OriginalMaxUtf8Length
): String = {
val text = nfcNormalize(input)
val extractor = new Extractor
val entities = extractor.extractEntitiesWithIndices(text)
extractor.modifyIndicesFromUTF16ToUnicode(text, entities)
val avoid = Offset.Ranges.fromCodePointPairs(
entities.asScala.map(e => (e.getStart().intValue, e.getEnd().intValue))
)
val truncateAt = truncationPoint(text, maxDisplayLength, maxByteLength, None, avoid)
text.take(truncateAt.codeUnitOffset.toInt)
}
}

View file

@ -0,0 +1,62 @@
package com.twitter.tweetypie.tweettext
import java.text.Normalizer
object TweetText {
/** The original maximum tweet length, taking into account normalization */
private[tweetypie] val OriginalMaxDisplayLength = 140
/** Maximum number of visible code points allowed in a tweet when tweet length is counted by code
* points, taking into account normalization. See also [[MaxVisibleWeightedEmojiLength]].
*/
private[tweetypie] val MaxVisibleWeightedLength = 280
/** Maximum number of visible code points allowed in a tweet when tweet length is counted by
* emoji, taking into account normalization. See also [[MaxVisibleWeightedLength]].
* 140 is the max number of Emojis, visible, fully-weighted per Twitter's cramming rules
* 10 is the max number of Code Points per Emoji
*/
private[tweetypie] val MaxVisibleWeightedEmojiLength = 140 * 10
/** Maximum number of bytes when truncating tweet text for a retweet. Originally was the
* max UTF-8 length when tweets were at most 140 characters.
* See also [[OriginalMaxDisplayLength]].
*/
private[tweetypie] val OriginalMaxUtf8Length = 600
/** Maximum number of bytes for tweet text using utf-8 encoding.
*/
private[tweetypie] val MaxUtf8Length = 5708
/** Maximum number of mentions allowed in tweet text. This is enforced at tweet creation time */
private[tweetypie] val MaxMentions = 50
/** Maximum number of urls allowed in tweet text. This is enforced at tweet creation time */
private[tweetypie] val MaxUrls = 10
/** Maximum number of hashtags allowed in tweet text. This is enforced at tweet creation time */
private[tweetypie] val MaxHashtags = 50
/** Maximum number of cashtags allowed in tweet text. This is enforced at tweet creation time */
private[tweetypie] val MaxCashtags = 50
/** Maximum length of a hashtag (not including the '#') */
private[tweetypie] val MaxHashtagLength = 100
/**
* Normalizes the text according to the unicode NFC spec.
*/
def nfcNormalize(text: String): String = Normalizer.normalize(text, Normalizer.Form.NFC)
/**
* Return the number of "characters" in this text. See
* [[Offset.DisplayUnit]].
*/
def displayLength(text: String): Int = Offset.DisplayUnit.length(text).toInt
/**
* Return the number of Unicode code points in this String.
*/
def codePointLength(text: String): Int = Offset.CodePoint.length(text).toInt
}

View file

@ -0,0 +1,76 @@
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
provides = scala_artifact(
org = "com.twitter.tweetypie",
name = "util",
repo = artifactory,
),
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"//:scala-reflect",
"3rdparty/jvm/commons-codec",
"3rdparty/jvm/org/apache/thrift:libthrift",
"finagle/finagle-core/src/main",
"mediaservices/commons/src/main/thrift:thrift-scala",
"scrooge/scrooge-serializer/src/main/scala",
"tweetypie/servo/repo",
"tweetypie/servo/util",
"tweetypie/servo/util/src/main/scala:exception",
"src/scala/com/twitter/takedown/util",
"src/thrift/com/twitter/dataproducts:enrichments_profilegeo-scala",
"src/thrift/com/twitter/escherbird:media-annotation-structs-scala",
"src/thrift/com/twitter/expandodo:cards-scala",
"src/thrift/com/twitter/gizmoduck:thrift-scala",
"src/thrift/com/twitter/servo:servo-exception-scala",
"src/thrift/com/twitter/spam/rtf:safety-label-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:deprecated-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:service-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:transient_context-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
"stitch/stitch-core",
"tweet-util",
"util/util-core:scala",
],
)
scala_library(
name = "EditControlUtil",
sources = [
"EditControlUtil.scala",
"package.scala",
],
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
provides = scala_artifact(
org = "com.twitter.tweetypie",
name = "util-EditControlUtil",
repo = artifactory,
),
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"//:scala-reflect",
"3rdparty/jvm/commons-codec",
"3rdparty/jvm/org/apache/thrift:libthrift",
"finagle/finagle-core/src/main",
"mediaservices/commons/src/main/thrift:thrift-scala",
"scrooge/scrooge-serializer/src/main/scala",
"tweetypie/servo/util/src/main/scala:exception",
"src/thrift/com/twitter/dataproducts:enrichments_profilegeo-scala",
"src/thrift/com/twitter/escherbird:media-annotation-structs-scala",
"src/thrift/com/twitter/expandodo:cards-scala",
"src/thrift/com/twitter/gizmoduck:thrift-scala",
"src/thrift/com/twitter/servo:servo-exception-scala",
"src/thrift/com/twitter/spam/rtf:safety-label-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:deprecated-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:service-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:transient_context-scala",
"tweetypie/common/src/thrift/com/twitter/tweetypie:tweet-scala",
"stitch/stitch-core",
"tweet-util",
"util/util-core:scala",
],
)

View file

@ -0,0 +1,29 @@
package com.twitter.tweetypie.util
import com.twitter.escherbird.thriftscala.TweetEntityAnnotation
import com.twitter.tweetypie.thriftscala.EscherbirdEntityAnnotations
import com.twitter.tweetypie.thriftscala.Tweet
object CommunityAnnotation {
val groupId: Long = 8
val domainId: Long = 31
def apply(communityId: Long): TweetEntityAnnotation =
TweetEntityAnnotation(groupId, domainId, entityId = communityId)
def unapply(annotation: TweetEntityAnnotation): Option[Long] =
annotation match {
case TweetEntityAnnotation(`groupId`, `domainId`, entityId) => Some(entityId)
case _ => None
}
// Returns None instead of Some(Seq()) when there are non-community annotations present
def additionalFieldsToCommunityIDs(additionalFields: Tweet): Option[Seq[Long]] = {
additionalFields.escherbirdEntityAnnotations
.map {
case EscherbirdEntityAnnotations(entityAnnotations) =>
entityAnnotations.flatMap(CommunityAnnotation.unapply)
}.filter(_.nonEmpty)
}
}

View file

@ -0,0 +1,19 @@
package com.twitter.tweetypie.util
import com.twitter.tweetypie.thriftscala.Communities
object CommunityUtil {
def communityIds(maybeCommunities: Option[Communities]): Seq[Long] = {
maybeCommunities match {
case None =>
Nil
case Some(Communities(seq)) =>
seq
}
}
def hasCommunity(maybeCommunities: Option[Communities]): Boolean = {
maybeCommunities.exists(_.communityIds.nonEmpty)
}
}

View file

@ -0,0 +1,112 @@
package com.twitter.tweetypie
package util
import com.twitter.tweetypie.thriftscala._
object ConversationControls {
object Create {
def byInvitation(
inviteViaMention: Option[Boolean] = None
): TweetCreateConversationControl.ByInvitation = TweetCreateConversationControl.ByInvitation(
TweetCreateConversationControlByInvitation(inviteViaMention = inviteViaMention)
)
def community(
inviteViaMention: Option[Boolean] = None
): TweetCreateConversationControl.Community = TweetCreateConversationControl.Community(
TweetCreateConversationControlCommunity(inviteViaMention = inviteViaMention)
)
def followers(
inviteViaMention: Option[Boolean] = None
): TweetCreateConversationControl.Followers = TweetCreateConversationControl.Followers(
TweetCreateConversationControlFollowers(inviteViaMention = inviteViaMention)
)
}
object Scenario {
case class CommonScenario(
createConversationControl: TweetCreateConversationControl,
descriptionSuffix: String,
expectedConversationControl: (UserId, Seq[UserId]) => ConversationControl,
inviteViaMention: Option[Boolean])
def mkCommunityScenario(inviteViaMention: Option[Boolean]): CommonScenario =
CommonScenario(
Create.community(inviteViaMention = inviteViaMention),
"community",
expectedConversationControl = (authorId, userIds) => {
community(userIds, authorId, inviteViaMention)
},
inviteViaMention
)
def mkByInvitationScenario(inviteViaMention: Option[Boolean]): CommonScenario =
CommonScenario(
Create.byInvitation(inviteViaMention = inviteViaMention),
"invited users",
expectedConversationControl = (authorId, userIds) => {
byInvitation(userIds, authorId, inviteViaMention)
},
inviteViaMention
)
def mkFollowersScenario(inviteViaMention: Option[Boolean]): CommonScenario =
CommonScenario(
Create.followers(inviteViaMention = inviteViaMention),
"followers",
expectedConversationControl = (authorId, userIds) => {
followers(userIds, authorId, inviteViaMention)
},
inviteViaMention
)
val communityScenario = mkCommunityScenario(None)
val communityInviteViaMentionScenario = mkCommunityScenario(Some(true))
val byInvitationScenario = mkByInvitationScenario(None)
val byInvitationInviteViaMentionScenario = mkByInvitationScenario(Some(true))
val followersScenario = mkFollowersScenario(None)
val followersInviteViaMentionScenario = mkFollowersScenario(Some(true))
}
def byInvitation(
invitedUserIds: Seq[UserId],
conversationTweetAuthorId: UserId,
inviteViaMention: Option[Boolean] = None
): ConversationControl =
ConversationControl.ByInvitation(
ConversationControlByInvitation(
conversationTweetAuthorId = conversationTweetAuthorId,
invitedUserIds = invitedUserIds,
inviteViaMention = inviteViaMention
)
)
def community(
invitedUserIds: Seq[UserId],
conversationTweetAuthorId: UserId,
inviteViaMention: Option[Boolean] = None
): ConversationControl =
ConversationControl.Community(
ConversationControlCommunity(
conversationTweetAuthorId = conversationTweetAuthorId,
invitedUserIds = invitedUserIds,
inviteViaMention = inviteViaMention
)
)
def followers(
invitedUserIds: Seq[UserId],
conversationTweetAuthorId: UserId,
inviteViaMention: Option[Boolean] = None
): ConversationControl =
ConversationControl.Followers(
ConversationControlFollowers(
conversationTweetAuthorId = conversationTweetAuthorId,
invitedUserIds = invitedUserIds,
inviteViaMention = inviteViaMention
)
)
}

View file

@ -0,0 +1,174 @@
package com.twitter.tweetypie.util
import com.twitter.servo.util.Gate
import com.twitter.tweetypie.util.TweetEditFailure.TweetEditInvalidEditControlException
import com.twitter.tweetypie.util.TweetEditFailure.TweetEditUpdateEditControlException
import com.twitter.tweetypie.thriftscala.EditControl
import com.twitter.tweetypie.thriftscala.EditControlEdit
import com.twitter.tweetypie.thriftscala.EditControlInitial
import com.twitter.tweetypie.thriftscala.Tweet
import com.twitter.util.Try
import com.twitter.util.Return
import com.twitter.util.Throw
import com.twitter.util.Time
import com.twitter.util.Duration
object EditControlUtil {
val maxTweetEditsAllowed = 5
val oldEditTimeWindow = Duration.fromMinutes(30)
val editTimeWindow = Duration.fromMinutes(60)
def editControlEdit(
initialTweetId: TweetId,
editControlInitial: Option[EditControlInitial] = None
): EditControl.Edit =
EditControl.Edit(
EditControlEdit(initialTweetId = initialTweetId, editControlInitial = editControlInitial))
// EditControl for the tweet that is not an edit, that is, any regular tweet we create
// that can, potentially, be edited later.
def makeEditControlInitial(
tweetId: TweetId,
createdAt: Time,
setEditWindowToSixtyMinutes: Gate[Unit] = Gate(_ => false)
): EditControl.Initial = {
val editWindow = if (setEditWindowToSixtyMinutes()) editTimeWindow else oldEditTimeWindow
val initial = EditControlInitial(
editTweetIds = Seq(tweetId),
editableUntilMsecs = Some(createdAt.plus(editWindow).inMilliseconds),
editsRemaining = Some(maxTweetEditsAllowed),
isEditEligible = defaultIsEditEligible,
)
EditControl.Initial(initial)
}
// Returns if a given latestTweetId is the latest edit in the EditControl
def isLatestEdit(
tweetEditControl: Option[EditControl],
latestTweetId: TweetId
): Try[Boolean] = {
tweetEditControl match {
case Some(EditControl.Initial(initial)) =>
isLatestEditFromEditControlInitial(Some(initial), latestTweetId)
case Some(EditControl.Edit(edit)) =>
isLatestEditFromEditControlInitial(
edit.editControlInitial,
latestTweetId
)
case _ => Throw(TweetEditInvalidEditControlException)
}
}
// Returns if a given latestTweetId is the latest edit in the EditControlInitial
private def isLatestEditFromEditControlInitial(
initialTweetEditControl: Option[EditControlInitial],
latestTweetId: TweetId
): Try[Boolean] = {
initialTweetEditControl match {
case Some(initial) =>
Return(latestTweetId == initial.editTweetIds.last)
case _ => Throw(TweetEditInvalidEditControlException)
}
}
/* Create an updated edit control for an initialTweet given the id of the new edit */
def editControlForInitialTweet(
initialTweet: Tweet,
newEditId: TweetId
): Try[EditControl.Initial] = {
initialTweet.editControl match {
case Some(EditControl.Initial(initial)) =>
Return(EditControl.Initial(plusEdit(initial, newEditId)))
case Some(EditControl.Edit(_)) => Throw(TweetEditUpdateEditControlException)
case _ =>
initialTweet.coreData match {
case Some(coreData) =>
Return(
makeEditControlInitial(
tweetId = initialTweet.id,
createdAt = Time.fromMilliseconds(coreData.createdAtSecs * 1000),
setEditWindowToSixtyMinutes = Gate(_ => true)
)
)
case None => Throw(new Exception("Tweet Missing Required CoreData"))
}
}
}
def updateEditControl(tweet: Tweet, newEditId: TweetId): Try[Tweet] =
editControlForInitialTweet(tweet, newEditId).map { editControl =>
tweet.copy(editControl = Some(editControl))
}
def plusEdit(initial: EditControlInitial, newEditId: TweetId): EditControlInitial = {
val newEditTweetIds = (initial.editTweetIds :+ newEditId).distinct.sorted
val editsCount = newEditTweetIds.size - 1 // as there is the original tweet ID there too.
initial.copy(
editTweetIds = newEditTweetIds,
editsRemaining = Some(maxTweetEditsAllowed - editsCount),
)
}
// The ID of the initial Tweet if this is an edit
def getInitialTweetIdIfEdit(tweet: Tweet): Option[TweetId] = tweet.editControl match {
case Some(EditControl.Edit(edit)) => Some(edit.initialTweetId)
case _ => None
}
// If this is the first tweet in an edit chain, return the same tweet id
// otherwise return the result of getInitialTweetId
def getInitialTweetId(tweet: Tweet): TweetId =
getInitialTweetIdIfEdit(tweet).getOrElse(tweet.id)
def isInitialTweet(tweet: Tweet): Boolean =
getInitialTweetId(tweet) == tweet.id
// Extracted just so that we can easily track where the values of isEditEligible is coming from.
private def defaultIsEditEligible: Option[Boolean] = Some(true)
// returns true if it's an edit of a Tweet or an initial Tweet that's been edited
def isEditTweet(tweet: Tweet): Boolean =
tweet.editControl match {
case Some(eci: EditControl.Initial) if eci.initial.editTweetIds.size <= 1 => false
case Some(_: EditControl.Initial) | Some(_: EditControl.Edit) | Some(
EditControl.UnknownUnionField(_)) =>
true
case None => false
}
// returns true if editControl is from an edit of a Tweet
// returns false for any other state, including edit intial.
def isEditControlEdit(editControl: EditControl): Boolean = {
editControl match {
case _: EditControl.Edit | EditControl.UnknownUnionField(_) => true
case _ => false
}
}
def getEditTweetIds(editControl: Option[EditControl]): Try[Seq[TweetId]] = {
editControl match {
case Some(EditControl.Edit(EditControlEdit(_, Some(eci)))) =>
Return(eci.editTweetIds)
case Some(EditControl.Initial(initial)) =>
Return(initial.editTweetIds)
case _ =>
Throw(new Exception(s"EditControlInitial not found in $editControl"))
}
}
}
object TweetEditFailure {
abstract class TweetEditException(msg: String) extends Exception(msg)
case object TweetEditGetInitialEditControlException
extends TweetEditException("Initial EditControl not found")
case object TweetEditInvalidEditControlException
extends TweetEditException("Invalid EditControl for initial_tweet")
case object TweetEditUpdateEditControlException
extends TweetEditException("Invalid Edit Control Update")
}

View file

@ -0,0 +1,45 @@
package com.twitter.tweetypie.util
import com.twitter.finagle.Backoff
import com.twitter.finagle.service.RetryPolicy
import com.twitter.finagle.service.RetryPolicy.RetryableWriteException
import com.twitter.servo.exception.thriftscala.ServerError
import com.twitter.util.Duration
import com.twitter.util.Throw
import com.twitter.util.TimeoutException
import com.twitter.util.Try
object RetryPolicyBuilder {
/**
* Retry on any exception.
*/
def anyFailure[A](backoffs: Stream[Duration]): RetryPolicy[Try[A]] =
RetryPolicy.backoff[Try[A]](Backoff.fromStream(backoffs)) {
case Throw(_) => true
}
/**
* Retry on com.twitter.util.TimeoutException
*/
def timeouts[A](backoffs: Stream[Duration]): RetryPolicy[Try[A]] =
RetryPolicy.backoff[Try[A]](Backoff.fromStream(backoffs)) {
case Throw(_: TimeoutException) => true
}
/**
* Retry on com.twitter.finagle.service.RetryableWriteExceptions
*/
def writes[A](backoffs: Stream[Duration]): RetryPolicy[Try[A]] =
RetryPolicy.backoff[Try[A]](Backoff.fromStream(backoffs)) {
case Throw(RetryableWriteException(_)) => true
}
/**
* Retry on com.twitter.servo.exception.thriftscala.ServerError
*/
def servoServerError[A](backoffs: Stream[Duration]): RetryPolicy[Try[A]] =
RetryPolicy.backoff[Try[A]](Backoff.fromStream(backoffs)) {
case Throw(ServerError(_)) => true
}
}

View file

@ -0,0 +1,54 @@
package com.twitter.tweetypie.util
import com.twitter.finagle.stats.Stat
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.servo
import com.twitter.util.Return
import com.twitter.util.Throw
import com.twitter.stitch.Stitch
object StitchUtils {
def trackLatency[T](latencyStat: Stat, s: => Stitch[T]): Stitch[T] = {
Stitch
.time(s)
.map {
case (res, duration) =>
latencyStat.add(duration.inMillis)
res
}
.lowerFromTry
}
def observe[T](statsReceiver: StatsReceiver, apiName: String): Stitch[T] => Stitch[T] = {
val stats = statsReceiver.scope(apiName)
val requests = stats.counter("requests")
val success = stats.counter("success")
val latencyStat = stats.stat("latency_ms")
val exceptionCounter =
new servo.util.ExceptionCounter(stats, "failures")
stitch =>
trackLatency(latencyStat, stitch)
.respond {
case Return(_) =>
requests.incr()
success.incr()
case Throw(e) =>
exceptionCounter(e)
requests.incr()
}
}
def translateExceptions[T](
stitch: Stitch[T],
translateException: PartialFunction[Throwable, Throwable]
): Stitch[T] =
stitch.rescue {
case t if translateException.isDefinedAt(t) =>
Stitch.exception(translateException(t))
case t => Stitch.exception(t)
}
}

View file

@ -0,0 +1,31 @@
package com.twitter.tweetypie.util
/**
* Escape a String into Java or Scala String literal syntax (adds the
* surrounding quotes.)
*
* This is primarily for printing Strings for debugging or logging.
*/
object StringLiteral extends (String => String) {
private[this] val ControlLimit = ' '
private[this] val PrintableLimit = '\u007e'
private[this] val Specials =
Map('\n' -> 'n', '\r' -> 'r', '\t' -> 't', '"' -> '"', '\'' -> '\'', '\\' -> '\\')
def apply(str: String): String = {
val s = new StringBuilder(str.length)
s.append('"')
var i = 0
while (i < str.length) {
val c = str(i)
Specials.get(c) match {
case None =>
if (c >= ControlLimit && c <= PrintableLimit) s.append(c)
else s.append("\\u%04x".format(c.toInt))
case Some(special) => s.append('\\').append(special)
}
i += 1
}
s.append('"').result
}
}

View file

@ -0,0 +1,49 @@
package com.twitter.tweetypie.util
import com.twitter.takedown.util.TakedownReasons
import com.twitter.takedown.util.TakedownReasons.CountryCode
import com.twitter.tseng.withholding.thriftscala.TakedownReason
import com.twitter.tseng.withholding.thriftscala.UnspecifiedReason
import com.twitter.tweetypie.thriftscala.Tweet
/**
* Contains tweetypie-specific utils for working with TakedownReasons.
*/
object Takedowns {
type CountryCode = String
/**
* Take a list of [[TakedownReason]] and return values to be saved on the [[Tweet]] in fields
* tweetypieOnlyTakedownCountryCode and tweetypieOnlyTakedownReason.
*
* - tweetypieOnlyTakedownCountryCode contains the country_code of all UnspecifiedReasons
* - tweetypieOnlyTakedownReason contains all other reasons
*/
def partitionReasons(reasons: Seq[TakedownReason]): (Seq[String], Seq[TakedownReason]) = {
val (unspecifiedReasons, specifiedReasons) = reasons.partition {
case TakedownReason.UnspecifiedReason(UnspecifiedReason(_)) => true
case _ => false
}
val unspecifiedCountryCodes = unspecifiedReasons.collect(TakedownReasons.reasonToCountryCode)
(unspecifiedCountryCodes, specifiedReasons)
}
def fromTweet(t: Tweet): Takedowns =
Takedowns(
Seq
.concat(
t.tweetypieOnlyTakedownCountryCodes
.getOrElse(Nil).map(TakedownReasons.countryCodeToReason),
t.tweetypieOnlyTakedownReasons.getOrElse(Nil)
).toSet
)
}
/**
* This class is used to ensure the caller has access to both the full list of reasons as well
* as the backwards-compatible list of country codes.
*/
case class Takedowns(reasons: Set[TakedownReason]) {
def countryCodes: Set[CountryCode] = reasons.collect(TakedownReasons.reasonToCountryCode)
}

View file

@ -0,0 +1,17 @@
package com.twitter.tweetypie.util
import com.twitter.tweetypie.thriftscala.TransientCreateContext
import com.twitter.tweetypie.thriftscala.TweetCreateContextKey
import com.twitter.tweetypie.thriftscala.TweetCreateContextKey.PeriscopeCreatorId
import com.twitter.tweetypie.thriftscala.TweetCreateContextKey.PeriscopeIsLive
object TransientContextUtil {
def toAdditionalContext(context: TransientCreateContext): Map[TweetCreateContextKey, String] =
Seq
.concat(
context.periscopeIsLive.map(PeriscopeIsLive -> _.toString), // "true" or "false"
context.periscopeCreatorId.map(PeriscopeCreatorId -> _.toString) // userId
)
.toMap
}

View file

@ -0,0 +1,203 @@
package com.twitter.tweetypie.util
import com.twitter.conversions.DurationOps._
import com.twitter.logging.Logger
import com.twitter.mediaservices.commons.mediainformation.thriftscala.UserDefinedProductMetadata
import com.twitter.scrooge.BinaryThriftStructSerializer
import com.twitter.servo.cache.ScopedCacheKey
import com.twitter.servo.util.Transformer
import com.twitter.tweetypie.thriftscala.PostTweetRequest
import com.twitter.util.Base64Long
import com.twitter.util.Time
import java.nio.ByteBuffer
import java.security.MessageDigest
import org.apache.commons.codec.binary.Base64
import scala.collection.immutable.SortedMap
object TweetCreationLock {
case class Key private (userId: UserId, typeCode: String, idOrMd5: String)
extends ScopedCacheKey("t", "locker", 2, Base64Long.toBase64(userId), typeCode, idOrMd5) {
def uniquenessId: Option[String] =
if (typeCode == Key.TypeCode.UniquenessId) Some(idOrMd5) else None
}
object Key {
private[this] val log = Logger(getClass)
object TypeCode {
val SourceTweetId = "r"
val UniquenessId = "u"
val PostTweetRequest = "p"
}
private[this] val serializer = BinaryThriftStructSerializer(PostTweetRequest)
// normalize the representation of no media ids.
private[util] def sanitizeMediaUploadIds(mediaUploadIds: Option[Seq[Long]]) =
mediaUploadIds.filter(_.nonEmpty)
/**
* Request deduplication depends on the hash of a serialized Thrift value.
*
* In order to guarantee that a Map has a reproducible serialized form,
* it's necessary to fix the ordering of its keys.
*/
private[util] def sanitizeMediaMetadata(
mediaMetadata: Option[scala.collection.Map[MediaId, UserDefinedProductMetadata]]
): Option[scala.collection.Map[MediaId, UserDefinedProductMetadata]] =
mediaMetadata.map(m => SortedMap(m.toSeq: _*))
/**
* Make sure to sanitize request fields with map/set since serialized
* bytes ordering is not guaranteed for same thrift values.
*/
private[util] def sanitizeRequest(request: PostTweetRequest): PostTweetRequest =
PostTweetRequest(
userId = request.userId,
text = request.text,
createdVia = "",
inReplyToTweetId = request.inReplyToTweetId,
geo = request.geo,
mediaUploadIds = sanitizeMediaUploadIds(request.mediaUploadIds),
narrowcast = request.narrowcast,
nullcast = request.nullcast,
additionalFields = request.additionalFields,
attachmentUrl = request.attachmentUrl,
mediaMetadata = sanitizeMediaMetadata(request.mediaMetadata),
conversationControl = request.conversationControl,
underlyingCreativesContainerId = request.underlyingCreativesContainerId,
editOptions = request.editOptions,
noteTweetOptions = request.noteTweetOptions
)
def bySourceTweetId(userId: UserId, sourceTweetId: TweetId): Key =
Key(userId, TypeCode.SourceTweetId, Base64Long.toBase64(sourceTweetId))
def byRequest(request: PostTweetRequest): Key =
request.uniquenessId match {
case Some(uqid) =>
byUniquenessId(request.userId, uqid)
case None =>
val sanitized = sanitizeRequest(request)
val sanitizedBytes = serializer.toBytes(sanitized)
val digested = MessageDigest.getInstance("SHA-256").digest(sanitizedBytes)
val base64Digest = Base64.encodeBase64String(digested)
val key = Key(request.userId, TypeCode.PostTweetRequest, base64Digest)
log.ifDebug(s"Generated key $key from request:\n${sanitized}")
key
}
/**
* Key for tweets that have a uniqueness id set. There is only one
* namespace of uniqueness ids, across all clients. They are
* expected to be Snowflake ids, in order to avoid cache
* collisions.
*/
def byUniquenessId(userId: UserId, uniquenessId: Long): Key =
Key(userId, TypeCode.UniquenessId, Base64Long.toBase64(uniquenessId))
}
/**
* The state of tweet creation for a given Key (request).
*/
sealed trait State
object State {
/**
* There is no tweet creation currently in progress. (This can
* either be represented by no entry in the cache, or this special
* marker. This lets us use checkAndSet for deletion to avoid
* accidentally overwriting other process' values.)
*/
case object Unlocked extends State
/**
* Some process is attempting to create the tweet.
*/
case class InProgress(token: Long, timestamp: Time) extends State
/**
* The tweet has already been successfully created, and has the
* specified id.
*/
case class AlreadyCreated(tweetId: TweetId, timestamp: Time) extends State
/**
* When stored in cache, each state is prefixed by a byte
* indicating the type of the entry.
*/
object TypeCode {
val Unlocked: Byte = 0.toByte
val InProgress: Byte = 1.toByte // + random long + timestamp
val AlreadyCreated: Byte = 2.toByte // + tweet id + timestamp
}
private[this] val BufferSize = 17 // type byte + 64-bit value + 64-bit timestamp
// Constant buffer to use for storing the serialized form on
// Unlocked.
private[this] val UnlockedBuf = Array[Byte](TypeCode.Unlocked)
// Store the serialization function in a ThreadLocal so that we can
// reuse the buffer between invocations.
private[this] val threadLocalSerialize = new ThreadLocal[State => Array[Byte]] {
override def initialValue(): State => Array[Byte] = {
// Allocate the thread-local state
val ary = new Array[Byte](BufferSize)
val buf = ByteBuffer.wrap(ary)
{
case Unlocked => UnlockedBuf
case InProgress(token, timestamp) =>
buf.clear()
buf
.put(TypeCode.InProgress)
.putLong(token)
.putLong(timestamp.sinceEpoch.inNanoseconds)
ary
case AlreadyCreated(tweetId, timestamp) =>
buf.clear()
buf
.put(TypeCode.AlreadyCreated)
.putLong(tweetId)
.putLong(timestamp.sinceEpoch.inNanoseconds)
ary
}
}
}
/**
* Convert this State to the cache representation.
*/
private[this] def toBytes(state: State): Array[Byte] =
threadLocalSerialize.get()(state)
/**
* Convert this byte array into a LockState.
*
* @throws RuntimeException if the buffer is not of the right size
* and format
*/
private[this] def fromBytes(bytes: Array[Byte]): State = {
val buf = ByteBuffer.wrap(bytes)
val result = buf.get() match {
case TypeCode.Unlocked => Unlocked
case TypeCode.InProgress => InProgress(buf.getLong(), buf.getLong().nanoseconds.afterEpoch)
case TypeCode.AlreadyCreated =>
AlreadyCreated(buf.getLong(), buf.getLong().nanoseconds.afterEpoch)
case other => throw new RuntimeException("Invalid type code: " + other)
}
if (buf.remaining != 0) {
throw new RuntimeException("Extra data in buffer: " + bytes)
}
result
}
/**
* How to serialize the State for storage in cache.
*/
val Serializer: Transformer[State, Array[Byte]] =
Transformer[State, Array[Byte]](tTo = toBytes _, tFrom = fromBytes _)
}
}

View file

@ -0,0 +1,506 @@
package com.twitter.tweetypie.util
import com.twitter.dataproducts.enrichments.thriftscala.ProfileGeoEnrichment
import com.twitter.expandodo.thriftscala._
import com.twitter.mediaservices.commons.thriftscala.MediaKey
import com.twitter.mediaservices.commons.tweetmedia.thriftscala._
import com.twitter.servo.data.Lens
import com.twitter.spam.rtf.thriftscala.SafetyLabel
import com.twitter.tseng.withholding.thriftscala.TakedownReason
import com.twitter.tweetypie.thriftscala._
import com.twitter.tweetypie.unmentions.thriftscala.UnmentionData
object TweetLenses {
import Lens.checkEq
def requireSome[A, B](l: Lens[A, Option[B]]): Lens[A, B] =
checkEq[A, B](
a => l.get(a).get,
(a, b) => l.set(a, Some(b))
)
def tweetLens[A](get: Tweet => A, set: (Tweet, A) => Tweet): Lens[Tweet, A] =
checkEq[Tweet, A](get, set)
val id: Lens[Tweet, TweetId] =
tweetLens[TweetId](_.id, (t, id) => t.copy(id = id))
val coreData: Lens[Tweet, Option[TweetCoreData]] =
tweetLens[Option[TweetCoreData]](_.coreData, (t, coreData) => t.copy(coreData = coreData))
val requiredCoreData: Lens[Tweet, TweetCoreData] =
requireSome(coreData)
val optUrls: Lens[Tweet, Option[Seq[UrlEntity]]] =
tweetLens[Option[Seq[UrlEntity]]](_.urls, (t, urls) => t.copy(urls = urls))
val urls: Lens[Tweet, Seq[UrlEntity]] =
tweetLens[Seq[UrlEntity]](_.urls.toSeq.flatten, (t, urls) => t.copy(urls = Some(urls)))
val optMentions: Lens[Tweet, Option[Seq[MentionEntity]]] =
tweetLens[Option[Seq[MentionEntity]]](_.mentions, (t, v) => t.copy(mentions = v))
val mentions: Lens[Tweet, Seq[MentionEntity]] =
tweetLens[Seq[MentionEntity]](_.mentions.toSeq.flatten, (t, v) => t.copy(mentions = Some(v)))
val unmentionData: Lens[Tweet, Option[UnmentionData]] =
tweetLens[Option[UnmentionData]](_.unmentionData, (t, v) => t.copy(unmentionData = v))
val optHashtags: Lens[Tweet, Option[Seq[HashtagEntity]]] =
tweetLens[Option[Seq[HashtagEntity]]](_.hashtags, (t, v) => t.copy(hashtags = v))
val hashtags: Lens[Tweet, Seq[HashtagEntity]] =
tweetLens[Seq[HashtagEntity]](_.hashtags.toSeq.flatten, (t, v) => t.copy(hashtags = Some(v)))
val optCashtags: Lens[Tweet, Option[Seq[CashtagEntity]]] =
tweetLens[Option[Seq[CashtagEntity]]](_.cashtags, (t, v) => t.copy(cashtags = v))
val cashtags: Lens[Tweet, Seq[CashtagEntity]] =
tweetLens[Seq[CashtagEntity]](_.cashtags.toSeq.flatten, (t, v) => t.copy(cashtags = Some(v)))
val optMedia: Lens[Tweet, Option[Seq[MediaEntity]]] =
tweetLens[Option[Seq[MediaEntity]]](_.media, (t, v) => t.copy(media = v))
val media: Lens[Tweet, Seq[MediaEntity]] =
tweetLens[Seq[MediaEntity]](_.media.toSeq.flatten, (t, v) => t.copy(media = Some(v)))
val mediaKeys: Lens[Tweet, Seq[MediaKey]] =
tweetLens[Seq[MediaKey]](
_.mediaKeys.toSeq.flatten,
{
case (t, v) => t.copy(mediaKeys = Some(v))
})
val place: Lens[Tweet, Option[Place]] =
tweetLens[Option[Place]](
_.place,
{
case (t, v) => t.copy(place = v)
})
val quotedTweet: Lens[Tweet, Option[QuotedTweet]] =
tweetLens[Option[QuotedTweet]](
_.quotedTweet,
{
case (t, v) => t.copy(quotedTweet = v)
})
val selfThreadMetadata: Lens[Tweet, Option[SelfThreadMetadata]] =
tweetLens[Option[SelfThreadMetadata]](
_.selfThreadMetadata,
{
case (t, v) => t.copy(selfThreadMetadata = v)
})
val composerSource: Lens[Tweet, Option[ComposerSource]] =
tweetLens[Option[ComposerSource]](
_.composerSource,
{
case (t, v) => t.copy(composerSource = v)
})
val deviceSource: Lens[Tweet, Option[DeviceSource]] =
tweetLens[Option[DeviceSource]](
_.deviceSource,
{
case (t, v) => t.copy(deviceSource = v)
})
val perspective: Lens[Tweet, Option[StatusPerspective]] =
tweetLens[Option[StatusPerspective]](
_.perspective,
{
case (t, v) => t.copy(perspective = v)
})
val cards: Lens[Tweet, Option[Seq[Card]]] =
tweetLens[Option[Seq[Card]]](
_.cards,
{
case (t, v) => t.copy(cards = v)
})
val card2: Lens[Tweet, Option[Card2]] =
tweetLens[Option[Card2]](
_.card2,
{
case (t, v) => t.copy(card2 = v)
})
val cardReference: Lens[Tweet, Option[CardReference]] =
tweetLens[Option[CardReference]](
_.cardReference,
{
case (t, v) => t.copy(cardReference = v)
})
val spamLabel: Lens[Tweet, Option[SafetyLabel]] =
tweetLens[Option[SafetyLabel]](
_.spamLabel,
{
case (t, v) => t.copy(spamLabel = v)
})
val lowQualityLabel: Lens[Tweet, Option[SafetyLabel]] =
tweetLens[Option[SafetyLabel]](
_.lowQualityLabel,
{
case (t, v) => t.copy(lowQualityLabel = v)
})
val nsfwHighPrecisionLabel: Lens[Tweet, Option[SafetyLabel]] =
tweetLens[Option[SafetyLabel]](
_.nsfwHighPrecisionLabel,
{
case (t, v) => t.copy(nsfwHighPrecisionLabel = v)
})
val bounceLabel: Lens[Tweet, Option[SafetyLabel]] =
tweetLens[Option[SafetyLabel]](
_.bounceLabel,
{
case (t, v) => t.copy(bounceLabel = v)
})
val takedownCountryCodes: Lens[Tweet, Option[Seq[String]]] =
tweetLens[Option[Seq[String]]](
_.takedownCountryCodes,
{
case (t, v) => t.copy(takedownCountryCodes = v)
})
val takedownReasons: Lens[Tweet, Option[Seq[TakedownReason]]] =
tweetLens[Option[Seq[TakedownReason]]](
_.takedownReasons,
{
case (t, v) => t.copy(takedownReasons = v)
})
val contributor: Lens[Tweet, Option[Contributor]] =
tweetLens[Option[Contributor]](
_.contributor,
{
case (t, v) => t.copy(contributor = v)
})
val mediaTags: Lens[Tweet, Option[TweetMediaTags]] =
tweetLens[Option[TweetMediaTags]](
_.mediaTags,
{
case (t, v) => t.copy(mediaTags = v)
})
val mediaTagMap: Lens[Tweet, Map[MediaId, Seq[MediaTag]]] =
tweetLens[Map[MediaId, Seq[MediaTag]]](
_.mediaTags.map { case TweetMediaTags(tagMap) => tagMap.toMap }.getOrElse(Map.empty),
(t, v) => {
val cleanMap = v.filter { case (_, tags) => tags.nonEmpty }
t.copy(mediaTags = if (cleanMap.nonEmpty) Some(TweetMediaTags(cleanMap)) else None)
}
)
val escherbirdEntityAnnotations: Lens[Tweet, Option[EscherbirdEntityAnnotations]] =
tweetLens[Option[EscherbirdEntityAnnotations]](
_.escherbirdEntityAnnotations,
{
case (t, v) => t.copy(escherbirdEntityAnnotations = v)
})
val communities: Lens[Tweet, Option[Communities]] =
tweetLens[Option[Communities]](
_.communities,
{
case (t, v) => t.copy(communities = v)
})
val tweetypieOnlyTakedownCountryCodes: Lens[Tweet, Option[Seq[String]]] =
tweetLens[Option[Seq[String]]](
_.tweetypieOnlyTakedownCountryCodes,
{
case (t, v) => t.copy(tweetypieOnlyTakedownCountryCodes = v)
})
val tweetypieOnlyTakedownReasons: Lens[Tweet, Option[Seq[TakedownReason]]] =
tweetLens[Option[Seq[TakedownReason]]](
_.tweetypieOnlyTakedownReasons,
{
case (t, v) => t.copy(tweetypieOnlyTakedownReasons = v)
})
val profileGeo: Lens[Tweet, Option[ProfileGeoEnrichment]] =
tweetLens[Option[ProfileGeoEnrichment]](
_.profileGeoEnrichment,
(t, v) => t.copy(profileGeoEnrichment = v)
)
val visibleTextRange: Lens[Tweet, Option[TextRange]] =
tweetLens[Option[TextRange]](
_.visibleTextRange,
{
case (t, v) => t.copy(visibleTextRange = v)
})
val selfPermalink: Lens[Tweet, Option[ShortenedUrl]] =
tweetLens[Option[ShortenedUrl]](
_.selfPermalink,
{
case (t, v) => t.copy(selfPermalink = v)
})
val extendedTweetMetadata: Lens[Tweet, Option[ExtendedTweetMetadata]] =
tweetLens[Option[ExtendedTweetMetadata]](
_.extendedTweetMetadata,
{
case (t, v) => t.copy(extendedTweetMetadata = v)
})
object TweetCoreData {
val userId: Lens[TweetCoreData, UserId] = checkEq[TweetCoreData, UserId](
_.userId,
{ (c, v) =>
// Pleases the compiler: https://github.com/scala/bug/issues/9171
val userId = v
c.copy(userId = userId)
})
val text: Lens[TweetCoreData, String] = checkEq[TweetCoreData, String](
_.text,
{ (c, v) =>
// Pleases the compiler: https://github.com/scala/bug/issues/9171
val text = v
c.copy(text = text)
})
val createdAt: Lens[TweetCoreData, TweetId] =
checkEq[TweetCoreData, Long](_.createdAtSecs, (c, v) => c.copy(createdAtSecs = v))
val createdVia: Lens[TweetCoreData, String] =
checkEq[TweetCoreData, String](
_.createdVia,
{
case (c, v) => c.copy(createdVia = v)
})
val hasTakedown: Lens[TweetCoreData, Boolean] =
checkEq[TweetCoreData, Boolean](
_.hasTakedown,
{
case (c, v) => c.copy(hasTakedown = v)
})
val nullcast: Lens[TweetCoreData, Boolean] =
checkEq[TweetCoreData, Boolean](
_.nullcast,
{
case (c, v) => c.copy(nullcast = v)
})
val nsfwUser: Lens[TweetCoreData, Boolean] =
checkEq[TweetCoreData, Boolean](
_.nsfwUser,
{
case (c, v) => c.copy(nsfwUser = v)
})
val nsfwAdmin: Lens[TweetCoreData, Boolean] =
checkEq[TweetCoreData, Boolean](
_.nsfwAdmin,
{
case (c, v) => c.copy(nsfwAdmin = v)
})
val reply: Lens[TweetCoreData, Option[Reply]] =
checkEq[TweetCoreData, Option[Reply]](
_.reply,
{
case (c, v) => c.copy(reply = v)
})
val share: Lens[TweetCoreData, Option[Share]] =
checkEq[TweetCoreData, Option[Share]](
_.share,
{
case (c, v) => c.copy(share = v)
})
val narrowcast: Lens[TweetCoreData, Option[Narrowcast]] =
checkEq[TweetCoreData, Option[Narrowcast]](
_.narrowcast,
{
case (c, v) => c.copy(narrowcast = v)
})
val directedAtUser: Lens[TweetCoreData, Option[DirectedAtUser]] =
checkEq[TweetCoreData, Option[DirectedAtUser]](
_.directedAtUser,
{
case (c, v) => c.copy(directedAtUser = v)
})
val conversationId: Lens[TweetCoreData, Option[ConversationId]] =
checkEq[TweetCoreData, Option[ConversationId]](
_.conversationId,
{
case (c, v) => c.copy(conversationId = v)
})
val placeId: Lens[TweetCoreData, Option[String]] =
checkEq[TweetCoreData, Option[String]](
_.placeId,
{
case (c, v) => c.copy(placeId = v)
})
val geoCoordinates: Lens[TweetCoreData, Option[GeoCoordinates]] =
checkEq[TweetCoreData, Option[GeoCoordinates]](
_.coordinates,
(c, v) => c.copy(coordinates = v)
)
val trackingId: Lens[TweetCoreData, Option[TweetId]] =
checkEq[TweetCoreData, Option[Long]](
_.trackingId,
{
case (c, v) => c.copy(trackingId = v)
})
val hasMedia: Lens[TweetCoreData, Option[Boolean]] =
checkEq[TweetCoreData, Option[Boolean]](
_.hasMedia,
{
case (c, v) => c.copy(hasMedia = v)
})
}
val counts: Lens[Tweet, Option[StatusCounts]] =
tweetLens[Option[StatusCounts]](
_.counts,
{
case (t, v) => t.copy(counts = v)
})
object StatusCounts {
val retweetCount: Lens[StatusCounts, Option[TweetId]] =
checkEq[StatusCounts, Option[Long]](
_.retweetCount,
(c, retweetCount) => c.copy(retweetCount = retweetCount)
)
val replyCount: Lens[StatusCounts, Option[TweetId]] =
checkEq[StatusCounts, Option[Long]](
_.replyCount,
(c, replyCount) => c.copy(replyCount = replyCount)
)
val favoriteCount: Lens[StatusCounts, Option[TweetId]] =
checkEq[StatusCounts, Option[Long]](
_.favoriteCount,
{
case (c, v) => c.copy(favoriteCount = v)
})
val quoteCount: Lens[StatusCounts, Option[TweetId]] =
checkEq[StatusCounts, Option[Long]](
_.quoteCount,
{
case (c, v) => c.copy(quoteCount = v)
})
}
val userId: Lens[Tweet, UserId] = requiredCoreData andThen TweetCoreData.userId
val text: Lens[Tweet, String] = requiredCoreData andThen TweetCoreData.text
val createdVia: Lens[Tweet, String] = requiredCoreData andThen TweetCoreData.createdVia
val createdAt: Lens[Tweet, ConversationId] = requiredCoreData andThen TweetCoreData.createdAt
val reply: Lens[Tweet, Option[Reply]] = requiredCoreData andThen TweetCoreData.reply
val share: Lens[Tweet, Option[Share]] = requiredCoreData andThen TweetCoreData.share
val narrowcast: Lens[Tweet, Option[Narrowcast]] =
requiredCoreData andThen TweetCoreData.narrowcast
val directedAtUser: Lens[Tweet, Option[DirectedAtUser]] =
requiredCoreData andThen TweetCoreData.directedAtUser
val conversationId: Lens[Tweet, Option[ConversationId]] =
requiredCoreData andThen TweetCoreData.conversationId
val placeId: Lens[Tweet, Option[String]] = requiredCoreData andThen TweetCoreData.placeId
val geoCoordinates: Lens[Tweet, Option[GeoCoordinates]] =
requiredCoreData andThen TweetCoreData.geoCoordinates
val hasTakedown: Lens[Tweet, Boolean] = requiredCoreData andThen TweetCoreData.hasTakedown
val nsfwAdmin: Lens[Tweet, Boolean] = requiredCoreData andThen TweetCoreData.nsfwAdmin
val nsfwUser: Lens[Tweet, Boolean] = requiredCoreData andThen TweetCoreData.nsfwUser
val nullcast: Lens[Tweet, Boolean] = requiredCoreData andThen TweetCoreData.nullcast
val trackingId: Lens[Tweet, Option[ConversationId]] =
requiredCoreData andThen TweetCoreData.trackingId
val hasMedia: Lens[Tweet, Option[Boolean]] = requiredCoreData andThen TweetCoreData.hasMedia
object CashtagEntity {
val indices: Lens[CashtagEntity, (Short, Short)] =
checkEq[CashtagEntity, (Short, Short)](
t => (t.fromIndex, t.toIndex),
(t, v) => t.copy(fromIndex = v._1, toIndex = v._2)
)
val text: Lens[CashtagEntity, String] =
checkEq[CashtagEntity, String](_.text, (t, text) => t.copy(text = text))
}
object HashtagEntity {
val indices: Lens[HashtagEntity, (Short, Short)] =
checkEq[HashtagEntity, (Short, Short)](
t => (t.fromIndex, t.toIndex),
(t, v) => t.copy(fromIndex = v._1, toIndex = v._2)
)
val text: Lens[HashtagEntity, String] =
checkEq[HashtagEntity, String](_.text, (t, text) => t.copy(text = text))
}
object MediaEntity {
val indices: Lens[MediaEntity, (Short, Short)] =
checkEq[MediaEntity, (Short, Short)](
t => (t.fromIndex, t.toIndex),
(t, v) => t.copy(fromIndex = v._1, toIndex = v._2)
)
val mediaSizes: Lens[MediaEntity, collection.Set[MediaSize]] =
checkEq[MediaEntity, scala.collection.Set[MediaSize]](
_.sizes,
(m, sizes) => m.copy(sizes = sizes)
)
val url: Lens[MediaEntity, String] =
checkEq[MediaEntity, String](
_.url,
{
case (t, v) => t.copy(url = v)
})
val mediaInfo: Lens[MediaEntity, Option[MediaInfo]] =
checkEq[MediaEntity, Option[MediaInfo]](
_.mediaInfo,
{
case (t, v) => t.copy(mediaInfo = v)
})
}
object MentionEntity {
val indices: Lens[MentionEntity, (Short, Short)] =
checkEq[MentionEntity, (Short, Short)](
t => (t.fromIndex, t.toIndex),
(t, v) => t.copy(fromIndex = v._1, toIndex = v._2)
)
val screenName: Lens[MentionEntity, String] =
checkEq[MentionEntity, String](
_.screenName,
(t, screenName) => t.copy(screenName = screenName)
)
}
object UrlEntity {
val indices: Lens[UrlEntity, (Short, Short)] =
checkEq[UrlEntity, (Short, Short)](
t => (t.fromIndex, t.toIndex),
(t, v) => t.copy(fromIndex = v._1, toIndex = v._2)
)
val url: Lens[UrlEntity, String] =
checkEq[UrlEntity, String](_.url, (t, url) => t.copy(url = url))
}
object Contributor {
val screenName: Lens[Contributor, Option[String]] =
checkEq[Contributor, Option[String]](
_.screenName,
(c, screenName) => c.copy(screenName = screenName)
)
}
object Reply {
val inReplyToScreenName: Lens[Reply, Option[String]] =
checkEq[Reply, Option[String]](
_.inReplyToScreenName,
(c, inReplyToScreenName) => c.copy(inReplyToScreenName = inReplyToScreenName)
)
val inReplyToStatusId: Lens[Reply, Option[TweetId]] =
checkEq[Reply, Option[TweetId]](
_.inReplyToStatusId,
(c, inReplyToStatusId) => c.copy(inReplyToStatusId = inReplyToStatusId)
)
}
}

View file

@ -0,0 +1,18 @@
package com.twitter.tweetypie.util
import com.twitter.tweetutil.TweetPermalink
import com.twitter.tweetypie.thriftscala._
object TweetPermalinkUtil {
def lastQuotedTweetPermalink(tweet: Tweet): Option[(UrlEntity, TweetPermalink)] =
lastQuotedTweetPermalink(TweetLenses.urls.get(tweet))
def lastQuotedTweetPermalink(urls: Seq[UrlEntity]): Option[(UrlEntity, TweetPermalink)] =
urls.flatMap(matchQuotedTweetPermalink).lastOption
def matchQuotedTweetPermalink(entity: UrlEntity): Option[(UrlEntity, TweetPermalink)] =
for {
expanded <- entity.expanded
permalink <- TweetPermalink.parse(expanded)
} yield (entity, permalink)
}

Some files were not shown because too many files have changed in this diff Show more