twitter-algorithm-ml/metrics/auroc.py

"""
AUROC metrics.
"""
from typing import Union

from tml.ml_logging.torch_logging import logging

import torch
import torchmetrics
from torchmetrics.utilities.data import dim_zero_cat


def _compute_helper(
  predictions: torch.Tensor,
  target: torch.Tensor,
  weights: torch.Tensor,
  max_positive_negative_weighted_sum: torch.Tensor,
  min_positive_negative_weighted_sum: torch.Tensor,
  equal_predictions_as_incorrect: bool,
) -> torch.Tensor:
  """
  Compute a weighted AUROC with a fixed policy for tied scores.

  Samples are ordered by ascending score. Every positive sample is then credited with the
  total weight of the negatives placed at or before it, and the weighted sum of those credits
  is normalised by both class weight sums.

  Args:
    predictions: The predicted scores.
    target: The labels. They are used directly as indicators (`target` for positives,
      `1 - target` for negatives), so this assumes 0/1 values -- confirm at the call site.
    weights: The sample weights, same shape as predictions.
    max_positive_negative_weighted_sum: The larger of the positive and negative weighted sums.
      Each positive's credit is divided by it before the final reduction.
    min_positive_negative_weighted_sum: The smaller of the positive and negative weighted sums.
      The reduced total is divided by it.
    equal_predictions_as_incorrect: How a positive and a negative with identical scores are
      counted. When False the pair counts as correctly ranked (pair weight = 1); when True it
      counts as incorrectly ranked (pair weight = 0).

  Returns:
    The AUROC under the chosen tie policy, as a scalar tensor.
  """
  axis = 0

  # Build a permutation sorted by the key (score, label). Scores are always ascending. Within
  # a group of equal scores, positives come first when equal_predictions_as_incorrect is True
  # (so tied negatives are not yet accumulated when the positive is visited) and negatives come
  # first otherwise. This is done as a label pre-sort followed by a *stable* sort on score.
  label_order = torch.argsort(target, dim=axis, descending=equal_predictions_as_incorrect)
  scores_in_label_order = torch.gather(predictions, axis, label_order)
  _, stable_score_order = torch.sort(scores_in_label_order, stable=True, dim=axis)
  permutation = torch.gather(label_order, axis, stable_score_order)

  ordered_target = torch.gather(target, axis, permutation)
  ordered_weights = torch.gather(weights, axis, permutation)

  # Running total of negative weight seen up to and including each position.
  negative_weight = (1.0 - ordered_target) * ordered_weights
  cumulative_negative_weight = torch.cumsum(negative_weight, 0)

  # Only positives contribute: each is credited with the negative weight ranked at or below it.
  positive_credit = (
    ordered_target * cumulative_negative_weight / max_positive_negative_weighted_sum
  )
  weighted_credit_total = torch.sum(ordered_weights * positive_credit)

  return weighted_credit_total / min_positive_negative_weighted_sum


class AUROCWithMWU(torchmetrics.Metric):
  """
  AUROC using Mann-Whitney U-test.
  See https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve.

  This AUROC implementation is well suited to (non-zero) low-CTR. In particular it will return
  the correct AUROC even if the predicted probabilities are all close to 0.
  Currently only support binary classification.

  All predictions, targets and weights passed to `update` are accumulated and the AUROC is
  computed over the full set in `compute`.
  """

  def __init__(self, label_threshold: float = 0.5, raise_missing_class: bool = False, **kwargs):
    """

    Args:
      label_threshold: Labels strictly above this threshold are considered positive labels,
                       otherwise, they are considered negative.
      raise_missing_class: If True, an error will be raised if negative or positive class is
        missing. Otherwise, we will simply log a warning.
      **kwargs: Additional parameters supported by all torchmetrics.Metric.
    """
    super().__init__(**kwargs)
    # List states: each update() appends one batch; entries are concatenated at compute time
    # (and with the "cat" reduction when the metric state is synchronised).
    self.add_state("predictions", default=[], dist_reduce_fx="cat")
    self.add_state("target", default=[], dist_reduce_fx="cat")
    self.add_state("weights", default=[], dist_reduce_fx="cat")

    self.label_threshold = label_threshold
    self.raise_missing_class = raise_missing_class

  def update(
    self,
    predictions: torch.Tensor,
    target: torch.Tensor,
    weight: Union[float, torch.Tensor] = 1.0,
  ) -> None:
    """
    Update the current auroc.
    Args:
      predictions: Predicted values, 1D Tensor or 2D Tensor of shape batch_size x 1.
      target: Ground truth. Must have same shape as predictions.
      weight: The weight to use for the predicted values. Shape should be
      broadcastable to that of predictions.
    """
    self.predictions.append(predictions)
    self.target.append(target)
    if not isinstance(weight, torch.Tensor):
      # The weight takes its dtype and shape from predictions, so it is created on the
      # predictions' device as well; compute() aligns target to predictions too.
      weight = torch.as_tensor(weight, dtype=predictions.dtype, device=predictions.device)
    self.weights.append(torch.broadcast_to(weight, predictions.size()))

  def compute(self) -> torch.Tensor:
    """
    Compute and return the accumulated AUROC.

    Pairs of a positive and a negative sample with identical scores count for 1/2.

    Raises:
      ValueError: If the positive or the negative class is missing and
        `raise_missing_class` is True.
    """
    weights = dim_zero_cat(self.weights)
    predictions = dim_zero_cat(self.predictions)
    target = dim_zero_cat(self.target).type_as(predictions)

    negative_mask = target <= self.label_threshold
    positive_mask = torch.logical_not(negative_mask)

    for class_mask, class_name in ((negative_mask, "Negative"), (positive_mask, "Positive")):
      if not class_mask.any():
        msg = f"{class_name} class missing. AUROC returned will be meaningless."
        if self.raise_missing_class:
          raise ValueError(msg)
        else:
          logging.warn(msg)

    weighted_actual_negative_sum = torch.sum(
      torch.where(negative_mask, weights, torch.zeros_like(weights))
    )

    weighted_actual_positive_sum = torch.sum(
      torch.where(positive_mask, weights, torch.zeros_like(weights))
    )

    max_positive_negative_weighted_sum = torch.max(
      weighted_actual_negative_sum, weighted_actual_positive_sum
    )

    min_positive_negative_weighted_sum = torch.min(
      weighted_actual_negative_sum, weighted_actual_positive_sum
    )

    # The pairwise computation uses the labels as 0/1 class indicators, so hand it the
    # thresholded labels. This keeps it consistent with the class weight sums above when the
    # raw labels are not exactly 0/1; for 0/1 labels the values are unchanged.
    binary_target = positive_mask.type_as(predictions)

    # Compute auroc with the weight set to 1 when positive & negative have identical scores.
    auroc_le = _compute_helper(
      target=binary_target,
      weights=weights,
      predictions=predictions,
      min_positive_negative_weighted_sum=min_positive_negative_weighted_sum,
      max_positive_negative_weighted_sum=max_positive_negative_weighted_sum,
      equal_predictions_as_incorrect=False,
    )

    # Compute auroc with the weight set to 0 when positive & negative have identical scores.
    auroc_lt = _compute_helper(
      target=binary_target,
      weights=weights,
      predictions=predictions,
      min_positive_negative_weighted_sum=min_positive_negative_weighted_sum,
      max_positive_negative_weighted_sum=max_positive_negative_weighted_sum,
      equal_predictions_as_incorrect=True,
    )

    # Compute auroc with the weight set to 1/2 when positive & negative have identical scores.
    return auroc_le - (auroc_le - auroc_lt) / 2.0