diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index bce695753..fbd5eadde 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -575,6 +575,61 @@ BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
     }
 }
 
+/* fast ARGB888->(A)BGR888 blending with pixel alpha */
+static void
+BlitRGBtoBGRPixelAlpha(SDL_BlitInfo * info)
+{
+    int width = info->dst_w;
+    int height = info->dst_h;
+    Uint32 *srcp = (Uint32 *) info->src;
+    int srcskip = info->src_skip >> 2;
+    Uint32 *dstp = (Uint32 *) info->dst;
+    int dstskip = info->dst_skip >> 2;
+
+    while (height--) {
+        /* *INDENT-OFF* */
+        DUFFS_LOOP4({
+        Uint32 dalpha;
+        Uint32 d;
+        Uint32 s1;
+        Uint32 d1;
+        Uint32 s = *srcp;
+        Uint32 alpha = s >> 24;
+        /* FIXME: Here we special-case opaque alpha since the
+           compositioning used (>>8 instead of /255) doesn't handle
+           it correctly. Also special-case alpha=0 for speed?
+           Benchmark this! */
+        if (alpha) {
+          /*
+           * take out the middle component (green), and process
+           * the other two in parallel. One multiply less.
+           */
+          s1 = s & 0xff00ff;
+          s1 = (s1 >> 16) | (s1 << 16);
+          s &= 0xff00;
+
+          if (alpha == SDL_ALPHA_OPAQUE) {
+              *dstp = 0xff000000 | s | s1;
+          } else {
+            d = *dstp;
+            dalpha = d >> 24;
+            d1 = d & 0xff00ff;
+            d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
+            d &= 0xff00;
+            d = (d + ((s - d) * alpha >> 8)) & 0xff00;
+            dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
+            *dstp = d1 | d | (dalpha << 24);
+          }
+        }
+        ++srcp;
+        ++dstp;
+        }, width);
+        /* *INDENT-ON* */
+        srcp += srcskip;
+        dstp += dstskip;
+    }
+}
+
 #ifdef __3dNOW__
 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
 static void
@@ -1407,6 +1462,12 @@ SDL_CalculateBlitA(SDL_Surface * surface)
 #endif
                     return BlitRGBtoRGBPixelAlpha;
                 }
+            } else if (sf->Rmask == df->Bmask
+                && sf->Gmask == df->Gmask
+                && sf->Bmask == df->Rmask && sf->BytesPerPixel == 4) {
+                if (sf->Amask == 0xff000000) {
+                    return BlitRGBtoBGRPixelAlpha;
+                }
             }
             return BlitNtoNPixelAlpha;