From 423e160e639d301feb2b4ba220199d112def0164 Mon Sep 17 00:00:00 2001
From: John Cox <jc@kynesim.co.uk>
Date: Tue, 19 Jan 2016 11:43:41 +0000
Subject: [PATCH] H.265 residual decode performance improvements

This patch overhauls ff_hevc_hls_residual_coding in an attempt to improve its
performance, especially on the Raspberry Pi 2.

To this end there are a number of changes:

1) Implement a more efficient method for extracting CABAC bypass bits in
bulk (called by22 in the patch as it decodes 22 bits at once). This takes
some setup and so is only useful if we know we have a lot of bypass bits
to decode at once, as we do in the residual coding. A rough sketch of the
idea is given below, after this list of changes.

2) Add ARM asm implementations of some of the more important loops.

3) Simplify the parameters passed to those loops and use precalculated
arrays wherever possible rather than working out constants repeatedly.
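
As a rough sketch of the idea behind (1): with CABAC bypass the offset
stays below the range, and each bin doubles the offset, pulls in one
bitstream bit and compares against the range, so a run of k bypass bins
collapses into a single divide. The helper below is illustrative only -
the names are invented for this note and it is not the code in the patch
(in the real code the offset lives, suitably scaled, in CABACContext.low):

    /* Hypothetical helper, not part of the patch.  Assumes offset < range
     * and that (offset << k) does not overflow. */
    static unsigned int bypass_bulk_sketch(unsigned int *offset,
                                           unsigned int range,
                                           unsigned int next_bits,
                                           unsigned int k)
    {
        unsigned int x = (*offset << k) | (next_bits & ((1u << k) - 1));
        unsigned int bins = x / range;  /* the k bypass bins, MSB first */
        *offset = x - bins * range;     /* i.e. x % range */
        return bins;
    }

get_cabac_by22_peek/_flush in the patch do essentially this; on platforms
without a fast integer divide, the divide is replaced by a multiply with a
precalculated 1/range value (cabac_by22_inv_range), which is good for 22
bits of result - hence "by22".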

Point (3) is needed to enable (2), as otherwise there are too many live
variables for the ARM register set to hold comfortably, and it turns out
that most of the performance improvement actually comes from (3) itself.
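
As an illustration of (3), the inner loops now index small tables of
precalculated coefficient and scale-matrix offsets (the off_xys[] tables
added to hevc_cabac.c) rather than rebuilding (x, y) coordinates for every
coefficient. Very roughly - the sketch below uses invented names, is not
the patch code, and omits the real dequant/saturate arithmetic:

    #include <stdint.h>

    typedef struct {
        uint16_t coeff;   /* offset into the coefficient block */
        uint16_t scale;   /* offset into the scale matrix */
    } xy_off_sketch_t;

    static void apply_levels_sketch(int16_t *coeffs,
                                    const uint8_t *scale_matrix,
                                    const xy_off_sketch_t *xy_off,
                                    const int *levels, unsigned int n)
    {
        for (unsigned int i = 0; i < n; i++) {
            /* One table load replaces the per-coefficient
             * (x_cg << 2) + scan_x_off[n] style address arithmetic. */
            coeffs[xy_off[i].coeff] = levels[i] * scale_matrix[xy_off[i].scale];
        }
    }

With the offsets and similar constants hoisted into tables, the hot loops
carry far fewer live values, which is what makes the ARM asm in (2)
practical.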

Signed-off-by: John Cox <jc@kynesim.co.uk>
---
 libavcodec/arm/cabac.h       |  155 +++++-
 libavcodec/arm/hevc_cabac.h  |  490 +++++++++++++++++++
 libavcodec/cabac.c           |   11 +-
 libavcodec/cabac.h           |    9 +-
 libavcodec/cabac_functions.h |   15 +-
 libavcodec/hevc.h            |    3 +
 libavcodec/hevc_cabac.c      | 1094 ++++++++++++++++++++++++++++++++----------
 7 files changed, 1516 insertions(+), 261 deletions(-)
 create mode 100644 libavcodec/arm/hevc_cabac.h

diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
index fdbf86b..0a3980a 100644
--- a/libavcodec/arm/cabac.h
+++ b/libavcodec/arm/cabac.h
@@ -26,13 +26,34 @@
 #include "libavutil/internal.h"
 #include "libavcodec/cabac.h"
 
+
+#if UNCHECKED_BITSTREAM_READER
+#define LOAD_16BITS_BEHI\
+        "ldrh       %[tmp]        , [%[ptr]]    , #2            \n\t"\
+        "rev        %[tmp]        , %[tmp]                      \n\t"
+#elif CONFIG_THUMB
+#define LOAD_16BITS_BEHI\
+        "ldr        %[tmp]        , [%[c], %[end]]              \n\t"\
+        "cmp        %[tmp]        , %[ptr]                      \n\t"\
+        "it         cs                                          \n\t"\
+        "ldrhcs     %[tmp]        , [%[ptr]]    , #2            \n\t"\
+        "rev        %[tmp]        , %[tmp]                      \n\t"
+#else
+#define LOAD_16BITS_BEHI\
+        "ldr        %[tmp]        , [%[c], %[end]]              \n\t"\
+        "cmp        %[tmp]        , %[ptr]                      \n\t"\
+        "ldrcsh     %[tmp]        , [%[ptr]]    , #2            \n\t"\
+        "rev        %[tmp]        , %[tmp]                      \n\t"
+#endif
+
+
 #define get_cabac_inline get_cabac_inline_arm
 static av_always_inline int get_cabac_inline_arm(CABACContext *c,
                                                  uint8_t *const state)
 {
     int bit;
+#if 0
     void *reg_b, *reg_c, *tmp;
-
     __asm__ volatile(
         "ldrb       %[bit]        , [%[state]]                  \n\t"
         "add        %[r_b]        , %[tables]   , %[lps_off]    \n\t"
@@ -100,9 +121,141 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
           [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
         : "memory", "cc"
         );
+#else
+   // *** Not thumb compatible yet
+   unsigned int reg_b, tmp;
+    __asm__ (
+        "ldrb       %[bit]        , [%[state]]                  \n\t"
+        "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
+        "and        %[tmp]        , %[range]    , #0xC0         \n\t"
+        "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
+        "ldrb       %[tmp]        , [%[r_b]     , %[tmp], lsl #1] \n\t"
+// %bit = *state
+// %range = range
+// %tmp = RangeLPS
+        "sub        %[range]      , %[range]    , %[tmp]        \n\t"
+
+        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
+        "ittt       ge                                          \n\t"
+        "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+        "mvnge      %[bit]        , %[bit]                      \n\t"
+        "movge      %[range]      , %[tmp]                      \n\t"
+
+        "clz        %[tmp]        , %[range]                    \n\t"
+        "sub        %[tmp]        , #23                         \n\t"
+
+        "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
+        "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
+        "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
+
+        "strb       %[r_b]        , [%[state]]                  \n\t"
+        "lsls       %[tmp]        , %[low]      , #16           \n\t"
+
+        "bne        2f                                          \n\t"
+        LOAD_16BITS_BEHI
+        "lsr        %[tmp]        , %[tmp]      , #15           \n\t"
+        "movw       %[r_b]        , #0xFFFF                     \n\t"
+        "sub        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+
+        "rbit       %[r_b]        , %[low]                      \n\t"
+        "clz        %[r_b]        , %[r_b]                      \n\t"
+        "sub        %[r_b]        , %[r_b]      , #16           \n\t"
+#if CONFIG_THUMB
+        "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+        "add        %[low]        , %[low]      , %[tmp]        \n\t"
+#else
+        "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
+#endif
+        "2:                                                     \n\t"
+        :    [bit]"=&r"(bit),
+             [low]"+&r"(c->low),
+           [range]"+&r"(c->range),
+             [r_b]"=&r"(reg_b),
+             [ptr]"+&r"(c->bytestream),
+             [tmp]"=&r"(tmp)
+          :  [state]"r"(state),
+            [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+              [byte]"M"(offsetof(CABACContext, bytestream)),
+#if !UNCHECKED_BITSTREAM_READER
+                 [c]"r"(c),
+               [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+           [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+        : "memory", "cc"
+        );
+#endif
 
     return bit & 1;
 }
+
+#define get_cabac_bypass get_cabac_bypass_arm
+static inline int get_cabac_bypass_arm(CABACContext * const c)
+{
+    int rv = 0;
+    unsigned int tmp;
+    __asm (
+        "lsl        %[low]        , #1                          \n\t"
+        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
+        "adc        %[rv]         , %[rv]       , #0            \n\t"
+        "it         cs                                          \n\t"
+        "subcs      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+        "lsls       %[tmp]        , %[low]      , #16           \n\t"
+        "bne        1f                                          \n\t"
+        LOAD_16BITS_BEHI
+        "add        %[low]        , %[low]      , %[tmp], lsr #15 \n\t"
+        "movw       %[tmp]        , #0xFFFF                     \n\t"
+        "sub        %[low]        , %[low]      , %[tmp]        \n\t"
+        "1:                                                     \n\t"
+        : // Outputs
+              [rv]"+&r"(rv),
+             [low]"+&r"(c->low),
+             [tmp]"=&r"(tmp),
+             [ptr]"+&r"(c->bytestream)
+        : // Inputs
+#if !UNCHECKED_BITSTREAM_READER
+                 [c]"r"(c),
+               [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+             [range]"r"(c->range)
+        : "cc"
+    );
+    return rv;
+}
+
+
+#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
+static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
+{
+    unsigned int tmp;
+    __asm (
+        "lsl        %[low]        , #1                          \n\t"
+        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
+        "ite        cc                                          \n\t"
+        "rsbcc      %[rv]         , %[rv]       , #0            \n\t"
+        "subcs      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+        "lsls       %[tmp]        , %[low]      , #16           \n\t"
+        "bne        1f                                          \n\t"
+        LOAD_16BITS_BEHI
+        "add        %[low]        , %[low]      , %[tmp], lsr #15 \n\t"
+        "movw       %[tmp]        , #0xFFFF                     \n\t"
+        "sub        %[low]        , %[low]      , %[tmp]        \n\t"
+        "1:                                                     \n\t"
+        : // Outputs
+              [rv]"+&r"(rv),
+             [low]"+&r"(c->low),
+             [tmp]"=&r"(tmp),
+             [ptr]"+&r"(c->bytestream)
+        : // Inputs
+#if !UNCHECKED_BITSTREAM_READER
+                 [c]"r"(c),
+               [end]"M"(offsetof(CABACContext, bytestream_end)),
+#endif
+             [range]"r"(c->range)
+        : "cc"
+    );
+    return rv;
+}
+
 #endif /* HAVE_ARMV6T2_INLINE */
 
 #endif /* AVCODEC_ARM_CABAC_H */
diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h
new file mode 100644
index 0000000..3d1680c
--- /dev/null
+++ b/libavcodec/arm/hevc_cabac.h
@@ -0,0 +1,490 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HEVC_CABAC_H
+#define AVCODEC_ARM_HEVC_CABAC_H
+
+#include "config.h"
+#if HAVE_ARMV6T2_INLINE
+
+#define hevc_mem_bits32 hevc_mem_bits32_arm
+static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
+{
+    unsigned int n;
+    __asm__ (
+        "rev        %[n], %[x]                     \n\t"
+        : [n]"=r"(n)
+        : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))
+        :
+        );
+    return n << (bits & 7);
+}
+
+
+// ---------------------------------------------------------------------------
+//
+// Helper fns - little bits of code where ARM has an instruction that the
+// compiler doesn't know about / use
+
+#define trans_scale_sat trans_scale_sat_arm
+static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+{
+    int rv;
+    int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
+
+    __asm__ (
+    "ssat %[rv], #16, %[t], ASR #1 \n\t"
+    : [rv]"=r"(rv)
+    : [t]"r"(t)
+    :
+    );
+    return rv;
+}
+
+#define update_rice update_rice_arm
+static inline void update_rice_arm(uint8_t * const stat_coeff,
+    const unsigned int last_coeff_abs_level_remaining,
+    const unsigned int c_rice_param)
+{
+    int t;
+    __asm__ (
+    "movs  %[t], %[coeff], LSR %[shift]     \n\t"
+    "it    eq                               \n\t"
+    "subeq %[stat], %[stat], #1             \n\t"
+    "cmp   %[t], #2                         \n\t"
+    "adc   %[stat], %[stat], #0            \n\t"
+    "usat  %[stat], #8, %[stat]            \n\t"
+    : [stat]"+&r"(*stat_coeff),
+          [t]"=r"(t)
+    :  [coeff]"r"(last_coeff_abs_level_remaining),
+       [shift]"r"(c_rice_param)
+    : "cc"
+    );
+}
+
+// ---------------------------------------------------------------------------
+//
+// CABAC get loops
+//
+// Where the loop is simple enough we can normally do 10-30% better than the
+// compiler
+
+// Get the residual greater than 1 bits
+
+#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
+static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
+    uint8_t * const state0)
+{
+    unsigned int i, reg_b, st, tmp, bit, rv;
+     __asm__ (
+         "mov        %[i]          , #0                          \n\t"
+         "mov        %[rv]         , #0                          \n\t"
+         "1:                                                     \n\t"
+         "add        %[i]          , %[i]        , #1            \n\t"
+         "cmp        %[rv]         , #0                          \n\t"
+         "ite        eq                                          \n\t"
+         "usateq     %[st]         , #2          , %[i]          \n\t"
+         "movne      %[st]         , #0                          \n\t"
+
+         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
+         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
+         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
+         "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
+         "ldrb       %[tmp]        , [%[r_b], %[tmp], lsl #1]    \n\t"
+         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
+
+         "cmp        %[low]        , %[range], lsl #17           \n\t"
+         "ittt       ge                                          \n\t"
+         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+         "mvnge      %[bit]        , %[bit]                      \n\t"
+         "movge      %[range]      , %[tmp]                      \n\t"
+
+         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
+         "and        %[bit]        , %[bit]      , #1            \n\t"
+         "orr        %[rv]         , %[bit]      , %[rv], lsl #1 \n\t"
+
+         "clz        %[tmp]        , %[range]                    \n\t"
+         "sub        %[tmp]        , #23                         \n\t"
+
+         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
+         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
+
+         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
+// There is a small speed gain from combining both conditions, using a single
+// branch and then working out what that meant later
+         "lsls       %[tmp]        , %[low]      , #16           \n\t"
+         "it         ne                                          \n\t"
+         "cmpne      %[n]          , %[i]                        \n\t"
+         "bne        1b                                          \n\t"
+
+// If reload is not required then we must have run out of flags to decode
+         "tst        %[tmp]        , %[tmp]                      \n\t"
+         "bne        2f                                          \n\t"
+
+// Do reload
+         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
+         "movw       %[r_b]        , #0xFFFF                     \n\t"
+         "rev        %[tmp]        , %[tmp]                      \n\t"
+         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
+
+         "rbit       %[r_b]        , %[low]                      \n\t"
+         "clz        %[r_b]        , %[r_b]                      \n\t"
+         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
+
+#if CONFIG_THUMB
+         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+         "add        %[low]        , %[low]      , %[tmp]        \n\t"
+#else
+         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
+#endif
+
+         "cmp        %[n]          , %[i]                        \n\t"
+         "bne        1b                                          \n\t"
+         "2:                                                     \n\t"
+         :    [bit]"=&r"(bit),
+              [low]"+&r"(c->low),
+            [range]"+&r"(c->range),
+              [r_b]"=&r"(reg_b),
+             [bptr]"+&r"(c->bytestream),
+                [i]"=&r"(i),
+              [tmp]"=&r"(tmp),
+               [st]"=&r"(st),
+               [rv]"=&r"(rv)
+          :  [state0]"r"(state0),
+                  [n]"r"(n),
+        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+               [byte]"M"(offsetof(CABACContext, bytestream)),
+            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+         : "memory", "cc"
+    );
+    return rv;
+}
+
+
+// n must be > 0 on entry
+#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
+static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
+    unsigned int n,
+    const uint8_t * const ctx_map,
+    uint8_t * p)
+{
+    unsigned int reg_b, tmp, st, bit;
+     __asm__ (
+         "1:                                                     \n\t"
+// Get bin from map
+         "ldrb       %[st]         , [%[ctx_map], %[n]]          \n\t"
+
+// Load state & ranges
+         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
+         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
+         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
+         "add        %[r_b]        , %[r_b]      , %[tmp], lsl #1 \n\t"
+         "ldrb       %[tmp]        , [%[r_b], %[bit]]            \n\t"
+         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
+
+         "cmp        %[low]        , %[range], lsl #17           \n\t"
+         "ittt       ge                                          \n\t"
+         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
+         "mvnge      %[bit]        , %[bit]                      \n\t"
+         "movge      %[range]      , %[tmp]                      \n\t"
+
+         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
+         "tst        %[bit]        , #1                          \n\t"
+// GCC asm seems to need strbne written differently for thumb and arm
+#if CONFIG_THUMB
+         "it         ne                                          \n\t"
+         "strbne     %[n]          , [%[idx]]    , #1            \n\t"
+#else
+         "strneb     %[n]          , [%[idx]]    , #1            \n\t"
+#endif
+
+// Renorm
+         "clz        %[tmp]        , %[range]                    \n\t"
+         "sub        %[tmp]        , #23                         \n\t"
+         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
+         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
+
+         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
+// There is a small speed gain from combining both conditions, using a single
+// branch and then working out what that meant later
+         "subs       %[n]          , %[n]        , #1            \n\t"
+#if CONFIG_THUMB
+         "itt        ne                                          \n\t"
+         "lslsne     %[tmp]        , %[low]      , #16           \n\t"
+         "bne        1b                                          \n\t"
+#else
+         "lslnes     %[tmp]        , %[low]      , #16           \n\t"
+         "bne        1b                                          \n\t"
+#endif
+
+// If we have bits left then n must be 0 so give up now
+         "lsls       %[tmp]        , %[low]      , #16           \n\t"
+         "bne        2f                                          \n\t"
+
+// Do reload
+         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
+         "movw       %[r_b]        , #0xFFFF                     \n\t"
+         "rev        %[tmp]        , %[tmp]                      \n\t"
+         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
+
+         "rbit       %[r_b]        , %[low]                      \n\t"
+         "clz        %[r_b]        , %[r_b]                      \n\t"
+         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
+
+#if CONFIG_THUMB
+         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+         "add        %[low]        , %[low]      , %[tmp]        \n\t"
+#else
+         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
+#endif
+
+// Check to see if we still have more to do
+         "cmp        %[n]          , #0                          \n\t"
+         "bne        1b                                          \n\t"
+         "2:                                                     \n\t"
+         :    [bit]"=&r"(bit),
+              [low]"+&r"(c->low),
+            [range]"+&r"(c->range),
+              [r_b]"=&r"(reg_b),
+             [bptr]"+&r"(c->bytestream),
+              [idx]"+&r"(p),
+                [n]"+&r"(n),
+              [tmp]"=&r"(tmp),
+               [st]"=&r"(st)
+          :  [state0]"r"(state0),
+            [ctx_map]"r"(ctx_map),
+        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
+               [byte]"M"(offsetof(CABACContext, bytestream)),
+            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
+         : "memory", "cc"
+    );
+
+    return p;
+}
+
+// ---------------------------------------------------------------------------
+//
+// CABAC_BY22 functions
+//
+// By and large these are (at best) no faster than their C equivalents - the
+// only one worth having is _peek where we do a slightly better job than the
+// compiler
+//
+// The others have been stashed here for reference in case larger scale asm
+// is attempted, in which case they might be a useful base.
+
+
+#define get_cabac_by22_peek get_cabac_by22_peek_arm
+static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
+{
+    uint32_t rv, tmp;
+    __asm__ (
+        "bic      %[rv]  , %[low], #1            \n\t"
+        "cmp      %[inv] , #0                    \n\t"
+        "it       ne                             \n\t"
+        "umullne  %[tmp] , %[rv] , %[inv], %[rv] \n\t"
+        :  // Outputs
+             [rv]"=&r"(rv),
+             [tmp]"=r"(tmp)
+        :  // Inputs
+             [low]"r"(c->low),
+             [inv]"r"(c->range)
+        :  // Clobbers
+                "cc"
+    );
+    return rv << 1;
+}
+
+#if 0
+
+// ***** Slower than the C  :-(
+#define get_cabac_by22_flush get_cabac_by22_flush_arm
+static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, const uint32_t val)
+{
+    uint32_t m, tmp;
+    __asm__ (
+    "add    %[bits], %[bits], %[n]   \n\t"
+    "ldr    %[m], [%[ptr], %[bits], lsr #3]  \n\t"
+
+    "rsb    %[tmp], %[n], #32        \n\t"
+    "lsr    %[tmp], %[val], %[tmp]   \n\t"
+    "mul    %[tmp], %[range], %[tmp] \n\t"
+
+    "rev    %[m], %[m]               \n\t"
+
+    "lsl    %[tmp], %[tmp], #23      \n\t"
+    "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+    "and    %[tmp], %[bits], #7         \n\t"
+    "lsl    %[m], %[m], %[tmp]          \n\t"
+
+    "orr    %[low], %[low], %[m], lsr #9      \n\t"
+        :  // Outputs
+             [m]"=&r"(m),
+           [tmp]"=&r"(tmp),
+          [bits]"+&r"(c->by22.bits),
+           [low]"+&r"(c->low)
+        :  // Inputs
+               [n]"r"(n),
+             [val]"r"(val),
+             [inv]"r"(c->range),
+           [range]"r"(c->by22.range),
+             [ptr]"r"(c->bytestream)
+        :  // Clobbers
+    );
+}
+
+
+// Works but slower than C
+#define coeff_abs_level_remaining_decode_by22(c,r) coeff_abs_level_remaining_decode_by22_arm(c, r)
+static int coeff_abs_level_remaining_decode_by22_arm(CABACContext * const c, const unsigned int c_rice_param)
+{
+    uint32_t n, val, tmp, level;
+
+//    PROFILE_START();
+
+    __asm__ (
+            // Peek
+            "bic    %[val],  %[low],   #1  \n\t"
+            "cmp    %[inv], #0          \n\t"
+            "umullne  %[tmp], %[val], %[inv], %[val] \n\t"
+            "lsl    %[val], %[val], #1  \n\t"
+
+            // Count bits (n = prefix)
+            "mvn    %[n], %[val] \n\t"
+            "clz    %[n], %[n]   \n\t"
+
+            "lsl    %[level], %[val], %[n] \n\t"
+            "subs   %[tmp], %[n], #3 \n\t"
+            "blo    2f \n\t"
+
+            // prefix >= 3
+            // < tmp = prefix - 3
+            // > tmp = prefix + rice - 3
+            "add    %[tmp], %[tmp], %[rice] \n\t"
+            // > n = prefix * 2 + rice - 3
+            "add    %[n], %[tmp], %[n] \n\t"
+            "cmp    %[n], #21 \n\t"
+            "bhi    3f \n\t"
+
+            "orr    %[level], %[level], #0x80000000 \n\t"
+            "rsb    %[tmp], %[tmp], #31 \n\t"
+            "lsr    %[level], %[level], %[tmp] \n\t"
+
+            "mov    %[tmp], #2 \n\t"
+            "add    %[level], %[level], %[tmp], lsl %[rice] \n\t"
+            "b      1f \n\t"
+
+            // > 22 bits used in total - need reload
+            "3:  \n\t"
+
+            // Stash prefix + rice - 3 in level (only spare reg)
+            "mov    %[level], %[tmp] \n\t"
+            // Restore n to flush value (prefix)
+            "sub    %[n], %[n], %[tmp] \n\t"
+
+            // Flush + reload
+
+//          "rsb    %[tmp], %[n], #32        \n\t"
+//          "lsr    %[tmp], %[val], %[tmp]   \n\t"
+//          "mul    %[tmp], %[range], %[tmp] \n\t"
+
+            // As it happens we know that all the bits we are flushing are 1
+            // so we can cheat slightly
+            "rsb    %[tmp], %[range], %[range], lsl %[n] \n\t"
+            "lsl    %[tmp], %[tmp], #23      \n\t"
+            "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+            "add    %[bits], %[bits], %[n]   \n\t"
+            "ldr    %[n], [%[ptr], %[bits], lsr #3]  \n\t"
+            "rev    %[n], %[n]               \n\t"
+            "and    %[tmp], %[bits], #7         \n\t"
+            "lsl    %[n], %[n], %[tmp]          \n\t"
+
+            "orr    %[low], %[low], %[n], lsr #9      \n\t"
+
+            // (reload)
+
+            "bic    %[val],  %[low],   #1  \n\t"
+            "cmp    %[inv], #0          \n\t"
+            "umullne  %[tmp], %[val], %[inv], %[val] \n\t"
+            "lsl    %[val], %[val], #1  \n\t"
+
+            // Build value
+
+            "mov    %[n], %[level] \n\t"
+
+            "orr     %[tmp], %[val], #0x80000000 \n\t"
+            "rsb     %[level], %[level], #31 \n\t"
+            "lsr     %[level], %[tmp], %[level] \n\t"
+
+            "mov    %[tmp], #2 \n\t"
+            "add    %[level], %[level], %[tmp], lsl %[rice] \n\t"
+            "b      1f \n\t"
+
+            // prefix < 3
+            "2:  \n\t"
+            "rsb    %[tmp], %[rice], #31 \n\t"
+            "lsr    %[level], %[level], %[tmp] \n\t"
+            "orr    %[level], %[level], %[n], lsl %[rice] \n\t"
+            "add    %[n], %[n], %[rice] \n\t"
+
+            "1:  \n\t"
+            // Flush
+            "add    %[n], %[n], #1 \n\t"
+
+            "rsb    %[tmp], %[n], #32        \n\t"
+            "lsr    %[tmp], %[val], %[tmp]   \n\t"
+
+            "add    %[bits], %[bits], %[n]   \n\t"
+            "ldr    %[val], [%[ptr], %[bits], lsr #3]  \n\t"
+
+            "mul    %[tmp], %[range], %[tmp] \n\t"
+            "lsl    %[tmp], %[tmp], #23      \n\t"
+            "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
+
+            "rev    %[val], %[val]               \n\t"
+            "and    %[tmp], %[bits], #7         \n\t"
+            "lsl    %[val], %[val], %[tmp]          \n\t"
+
+            "orr    %[low], %[low], %[val], lsr #9      \n\t"
+        :  // Outputs
+         [level]"=&r"(level),
+             [n]"=&r"(n),
+           [val]"=&r"(val),
+           [tmp]"=&r"(tmp),
+          [bits]"+&r"(c->by22.bits),
+           [low]"+&r"(c->low)
+        :  // Inputs
+            [rice]"r"(c_rice_param),
+             [inv]"r"(c->range),
+           [range]"r"(c->by22.range),
+             [ptr]"r"(c->bytestream)
+        :  // Clobbers
+                "cc"
+    );
+
+//    PROFILE_ACC(residual_abs);
+
+    return level;
+}
+#endif
+
+#endif /* HAVE_ARMV6T2_INLINE */
+
+#endif /* AVCODEC_ARM_HEVC_CABAC_H */
diff --git a/libavcodec/cabac.c b/libavcodec/cabac.c
index 5bf5bc2..f2c239d 100644
--- a/libavcodec/cabac.c
+++ b/libavcodec/cabac.c
@@ -183,10 +183,19 @@ int ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){
 #if CABAC_BITS == 16
     c->low =  (*c->bytestream++)<<18;
     c->low+=  (*c->bytestream++)<<10;
+    // Keep our fetches on a 2-byte boundary as this should avoid ever having to
+    // do unaligned loads if the compiler (or asm) optimises the double byte
+    // load into a single instruction
+    if(((uintptr_t)c->bytestream & 1) == 0) {
+        c->low += (1 << 9);
+    }
+    else {
+        c->low += ((*c->bytestream++) << 2) + 2;
+    }
 #else
     c->low =  (*c->bytestream++)<<10;
-#endif
     c->low+= ((*c->bytestream++)<<2) + 2;
+#endif
     c->range= 0x1FE;
     if ((c->range<<(CABAC_BITS+1)) < c->low)
         return AVERROR_INVALIDDATA;
diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
index 1bf1c62..ccfa991 100644
--- a/libavcodec/cabac.h
+++ b/libavcodec/cabac.h
@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
 typedef struct CABACContext{
     int low;
     int range;
-    int outstanding_count;
+    union
+    {
+        int outstanding_count;
+        struct {
+            uint16_t bits;
+            uint16_t range;
+        } by22;
+    };
     const uint8_t *bytestream_start;
     const uint8_t *bytestream;
     const uint8_t *bytestream_end;
diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h
index 31c919b..cbb186b 100644
--- a/libavcodec/cabac_functions.h
+++ b/libavcodec/cabac_functions.h
@@ -51,6 +51,7 @@ static const uint8_t * const ff_h264_lps_range = ff_h264_cabac_tables + H264_LPS
 static const uint8_t * const ff_h264_mlps_state = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET;
 static const uint8_t * const ff_h264_last_coeff_flag_offset_8x8 = ff_h264_cabac_tables + H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET;
 
+#if !defined(get_cabac_bypass) || !defined(get_cabac_terminate)
 static void refill(CABACContext *c){
 #if CABAC_BITS == 16
         c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
@@ -63,7 +64,9 @@ static void refill(CABACContext *c){
 #endif
         c->bytestream += CABAC_BITS / 8;
 }
+#endif
 
+#ifndef get_cabac_terminate
 static inline void renorm_cabac_decoder_once(CABACContext *c){
     int shift= (uint32_t)(c->range - 0x100)>>31;
     c->range<<= shift;
@@ -71,14 +74,18 @@ static inline void renorm_cabac_decoder_once(CABACContext *c){
     if(!(c->low & CABAC_MASK))
         refill(c);
 }
+#endif
 
 #ifndef get_cabac_inline
 static void refill2(CABACContext *c){
     int i;
     unsigned x;
-
+#if !HAVE_FAST_CLZ
     x= c->low ^ (c->low-1);
     i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)];
+#else
+    i = ff_ctz(c->low) - CABAC_BITS;
+#endif
 
     x= -CABAC_MASK;
 
@@ -94,7 +101,9 @@ static void refill2(CABACContext *c){
 #endif
         c->bytestream += CABAC_BITS/8;
 }
+#endif
 
+#ifndef get_cabac_inline
 static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
     int s = *state;
     int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s];
@@ -166,6 +175,7 @@ static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){
  *
  * @return the number of bytes read or 0 if no end
  */
+#ifndef get_cabac_terminate
 static int av_unused get_cabac_terminate(CABACContext *c){
     c->range -= 2;
     if(c->low < c->range<<(CABAC_BITS+1)){
@@ -175,11 +185,13 @@ static int av_unused get_cabac_terminate(CABACContext *c){
         return c->bytestream - c->bytestream_start;
     }
 }
+#endif
 
 /**
  * Skip @p n bytes and reset the decoder.
  * @return the address of the first skipped byte or NULL if there's less than @p n bytes left
  */
+#ifndef skip_bytes
 static av_unused const uint8_t* skip_bytes(CABACContext *c, int n) {
     const uint8_t *ptr = c->bytestream;
 
@@ -196,5 +208,6 @@ static av_unused const uint8_t* skip_bytes(CABACContext *c, int n) {
 
     return ptr;
 }
+#endif
 
 #endif /* AVCODEC_CABAC_FUNCTIONS_H */
diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
index 9d72555..9d0d383 100644
--- a/libavcodec/hevc.h
+++ b/libavcodec/hevc.h
@@ -23,6 +23,9 @@
 #ifndef AVCODEC_HEVC_H
 #define AVCODEC_HEVC_H
 
+// define RPI to split the CABAC/prediction/transform into separate stages
+#include "config.h"
+
 #include "libavutil/buffer.h"
 #include "libavutil/md5.h"
 
diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
index d1bef83..f13ca65 100644
--- a/libavcodec/hevc_cabac.c
+++ b/libavcodec/hevc_cabac.c
@@ -21,14 +21,72 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define UNCHECKED_BITSTREAM_READER 1
+
 #include "libavutil/attributes.h"
 #include "libavutil/common.h"
 
-#include "cabac_functions.h"
 #include "hevc.h"
+#include "cabac_functions.h"
+
+// BY22 is probably faster than simple bypass if the processor has
+// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
+// x86 has fast int divide
+// Arm doesn't have divide or general fast 64 bit, but does have the multiply
+// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
+#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
+// Use native divide if we have a fast one - otherwise use mpy 1/x
+// x86 has a fast integer divide - arm doesn't - unsure about other
+// architectures
+#define USE_BY22_DIV  ARCH_X86
+
+// Special case blocks with a single significant coeff
+// Decreases the complexity of the code for a common case but increases the
+// code size.
+#define USE_N_END_1 1
+
+#if ARCH_ARM
+#include "arm/hevc_cabac.h"
+#endif
 
 #define CABAC_MAX_BIN 31
 
+
+#if USE_BY22 && !USE_BY22_DIV
+#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
+
+static const uint32_t cabac_by22_inv_range[256] = {
+                                                    0,      I(257), I(258), I(259),
+    I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
+    I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
+    I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
+    I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
+    I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
+    I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
+    I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
+    I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
+    I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
+    I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
+    I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
+    I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
+    I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
+    I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
+    I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
+    I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
+    I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
+    I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
+    I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
+    I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
+    I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
+    I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
+    I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
+    I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
+    I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
+    I(510), I(511)
+};
+#undef I
+#endif  // USE_BY22 && !USE_BY22_DIV
+
 /**
  * number of bin by SyntaxElement.
  */
@@ -445,6 +503,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = {
     { 28, 36, 43, 49, 54, 58, 61, 63, },
 };
 
+
+typedef struct
+{
+    uint16_t coeff;
+    uint16_t scale;
+} xy_off_t;
+
+#define XYT_C(x,y,t) ((x) + ((y) << (t)))
+#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
+#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
+#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
+
+#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
+
+#define OFF_DIAG(t) {\
+    XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
+    XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
+    XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
+    XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
+}
+
+#define OFF_HORIZ(t) {\
+    XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
+    XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
+    XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
+    XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
+}
+
+#define OFF_VERT(t) {\
+    XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
+    XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
+    XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
+    XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
+}
+
+static const xy_off_t off_xys[3][4][16] =
+{
+    {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
+    {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
+    {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
+};
+
+
+// Helper fns
+#ifndef hevc_mem_bits32
+static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
+{
+    return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);
+}
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
+#define hevc_clz32 hevc_clz32_builtin
+static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
+{
+    // __builtin_clz says it works on ints - so adjust if int is >32 bits long
+    return __builtin_clz(x) - (sizeof(int) * 8 - 32);
+}
+#endif
+
+// It is unlikely that we will ever need this but include for completeness
+#ifndef hevc_clz32
+static inline unsigned int hevc_clz32(unsigned int x)
+{
+    unsigned int n = 1;
+    if ((x & 0xffff0000) == 0) {
+        n += 16;
+        x <<= 16;
+    }
+    if ((x & 0xff000000) == 0) {
+        n += 8;
+        x <<= 8;
+    }
+    if ((x & 0xf0000000) == 0) {
+        n += 4;
+        x <<= 4;
+    }
+    if ((x & 0xc0000000) == 0) {
+        n += 2;
+        x <<= 2;
+    }
+    return n - ((x >> 31) & 1);
+}
+#endif
+
+
+#if !USE_BY22
+// If no by22 then _by22 functions will revert to normal and so _peek/_flush
+// will no longer be called but the setup calls will still exist and we want
+// to null them out
+#define bypass_start(s)
+#define bypass_finish(s)
+#else
+// Use BY22 for residual bypass block
+
+#define bypass_start(s) get_cabac_by22_start(&s->HEVClc->cc)
+#define bypass_finish(s) get_cabac_by22_finish(&s->HEVClc->cc)
+
+// BY22 notes that bypass decoding is simply a divide into the bitstream, so
+// we can peek out large quantities of bits at once and treat the result as
+// if it were VLC.  In many cases this gives O(1) processing rather than
+// O(n), though the setup and teardown are sufficiently expensive that it is
+// only worth using if we expect to be dealing with more than a few bits.
+// The definition of "a few bits" will vary from platform to platform, but
+// tests on ARM show that it probably isn't worth it for a single coded
+// residual, but is for >1 - this is probably reinforced by the fact that
+// more residuals are likely to mean bigger residuals, which makes the O(1)
+// nature of the code more worthwhile.
+
+
+#if !USE_BY22_DIV
+// * 1/x @ 32 bits gets us 22 bits of accuracy
+#define CABAC_BY22_PEEK_BITS  22
+#else
+// A real 32-bit divide gets us another bit
+// If we have a 64 bit int & a unit time divider then we should get a lot
+// of bits (55)  but that is untested and it is unclear if it would give
+// us a large advantage
+#define CABAC_BY22_PEEK_BITS  23
+#endif
+
+// Bypass block start
+// Must be called before _by22_peek is used as it sets the CABAC environment
+// into the correct state.  _by22_finish must be called to return to 'normal'
+// (i.e. non-bypass) cabac decoding
+static inline void get_cabac_by22_start(CABACContext * const c)
+{
+    const unsigned int bits = __builtin_ctz(c->low);
+    const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
+    uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
+#if !USE_BY22_DIV
+    const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
+#endif
+
+    c->bytestream -= (CABAC_BITS / 8);
+    c->by22.bits = bits;
+#if !USE_BY22_DIV
+    c->by22.range = c->range;
+    c->range = inv;
+#endif
+    c->low = x;
+}
+
+// Bypass block finish
+// Must be called at the end of the bypass block to return to normal operation
+static inline void get_cabac_by22_finish(CABACContext * const c)
+{
+    unsigned int used = c->by22.bits;
+    unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
+    unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
+
+    c->bytestream += bytes_used + (CABAC_BITS / 8);
+    c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
+#if !USE_BY22_DIV
+    c->range = c->by22.range;
+#endif
+}
+
+// Peek bypass bits
+// _by22_start must be called before _by22_peek is called and _by22_flush
+// must be called afterwards to flush any used bits
+// The actual number of valid bits returned is
+// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
+// will be at least 22 which should be long enough for any prefix or suffix
+// though probably not long enough for the worst case combination
+#ifndef get_cabac_by22_peek
+static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
+{
+#if USE_BY22_DIV
+    return ((unsigned int)c->low / (unsigned int)c->range) << 9;
+#else
+    uint32_t x = c->low & ~1U;
+    const uint32_t inv = c->range;
+
+    if (inv != 0)
+        x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
+
+    return x << 1;
+#endif
+}
+#endif
+
+// Flush bypass bits peeked by _by22_peek
+// Flush n bypass bits. n must be >= 1 to guarantee correct operation
+// val is an unmodified copy of whatever _by22_peek returned
+#ifndef get_cabac_by22_flush
+static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
+{
+    // Subtract the bits used & reshift up to the top of the word
+#if USE_BY22_DIV
+    const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
+#else
+    const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
+#endif
+
+    // and refill lower bits
+    // We will probably OR over some existing bits but that doesn't matter
+    c->by22.bits += n;
+    c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
+}
+#endif
+
+#endif  // USE_BY22
+
+
 void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts)
 {
     if (s->ps.pps->entropy_coding_sync_enabled_flag &&
@@ -863,19 +1126,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth)
     return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth);
 }
 
-static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx)
+static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz)
 {
-    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx);
+    return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
 }
 
-static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx)
+static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz)
 {
-    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx);
+    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
 }
 
-static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx)
+static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz)
 {
-    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx);
+    return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
 }
 
 int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) {
@@ -891,14 +1154,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) {
     return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx);
 }
 
-static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx,
+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz,
                                                    int log2_size, int *last_scx_prefix, int *last_scy_prefix)
 {
     int i = 0;
     int max = (log2_size << 1) - 1;
     int ctx_offset, ctx_shift;
 
-    if (!c_idx) {
+    if (!c_idx_nz) {
         ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
         ctx_shift = (log2_size + 1) >> 2;
     } else {
@@ -929,22 +1192,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s,
     return value;
 }
 
-static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg)
+static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int ctx_cg)
 {
     int inc;
 
-    inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0);
+    inc = (ctx_cg != 0) + (c_idx_nz << 1);
 
     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc);
 }
-static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c,
-                                           int offset, const uint8_t *ctx_idx_map)
-{
-    int inc = ctx_idx_map[(y_c << 2) + x_c] + offset;
-    return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc);
-}
 
-static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset)
+static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int offset)
 {
     return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
 }
@@ -966,65 +1223,305 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s,
     return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc);
 }
 
-static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param)
+
+#if !USE_BY22
+#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
+#endif
+
+
+#ifndef coeff_abs_level_remaining_decode_bypass
+static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param)
+{
+    CABACContext * const c = &s->HEVClc->cc;
+    uint32_t y;
+    unsigned int prefix;
+    unsigned int last_coeff_abs_level_remaining;
+    unsigned int n;
+
+    y = get_cabac_by22_peek(c);
+    prefix = hevc_clz32(~y);
+    // y << prefix will always have top bit 0
+
+    if (prefix < 3) {
+        const unsigned int suffix = (y << prefix) >> (31 - rice_param);
+        last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
+        n = prefix + 1 + rice_param;
+    }
+    else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
+    {
+        const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
+
+        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+        n = prefix * 2 + rice_param - 2;
+    }
+    else {
+        unsigned int suffix;
+
+        get_cabac_by22_flush(c, prefix, y);
+        y = get_cabac_by22_peek(c);
+
+        suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
+        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
+        n = prefix + rice_param - 2;
+    }
+
+    get_cabac_by22_flush(c, n, y);
+
+    return last_coeff_abs_level_remaining;
+}
+#endif
+
+static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param)
 {
+    CABACContext * const c = &s->HEVClc->cc;
     int prefix = 0;
     int suffix = 0;
     int last_coeff_abs_level_remaining;
     int i;
 
-    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc))
+    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
         prefix++;
     if (prefix == CABAC_MAX_BIN) {
         av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
         return 0;
     }
+
     if (prefix < 3) {
         for (i = 0; i < rc_rice_param; i++)
-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
+            suffix = (suffix << 1) | get_cabac_bypass(c);
         last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
     } else {
         int prefix_minus3 = prefix - 3;
         for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
-            suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc);
+            suffix = (suffix << 1) | get_cabac_bypass(c);
         last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
                                               << rc_rice_param) + suffix;
     }
+
     return last_coeff_abs_level_remaining;
 }
 
-static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb)
+#if !USE_BY22
+#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
+static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s, const unsigned int nb)
 {
-    int i;
-    int ret = 0;
+    CABACContext * const c = &s->HEVClc->cc;
+    unsigned int i;
+    uint32_t ret = 0;
 
     for (i = 0; i < nb; i++)
-        ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc);
-    return ret;
+        ret = (ret << 1) | get_cabac_bypass(c);
+
+    return ret << (32 - nb);
+}
+#endif
+
+#ifndef coeff_sign_flag_decode_bypass
+static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext * const s, const unsigned int nb)
+{
+    CABACContext * const c = &s->HEVClc->cc;
+    uint32_t y;
+    y = get_cabac_by22_peek(c);
+    get_cabac_by22_flush(c, nb, y);
+    return y & ~(0xffffffffU >> nb);
+}
+#endif
+
+
+#ifndef get_cabac_greater1_bits
+static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
+    uint8_t * const state0)
+{
+    unsigned int i;
+    unsigned int rv = 0;
+    for (i = 0; i != n; ++i) {
+        const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;
+        const unsigned int b = get_cabac(c, state0 + idx);
+        rv = (rv << 1) | b;
+    }
+    return rv;
 }
+#endif
+
+
+// N.B. levels returned are the values assuming coeff_abs_level_remaining
+// is uncoded, so 1 must be added if it is coded.  sum_abs also reflects
+// this version of events.
+static inline uint32_t get_greaterx_bits(HEVCContext * const s, const unsigned int n_end, int * const levels,
+    int * const pprev_subset_coded, int * const psum,
+    const unsigned int idx0_gt1, const unsigned int idx_gt2)
+{
+    CABACContext * const c = &s->HEVClc->cc;
+    uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1;
+    uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2;
+    unsigned int rv;
+    unsigned int i;
+    const unsigned int n = FFMIN(n_end, 8);
+
+    // Really this is i != n but the simple unconditional loop is cheaper
+    // and faster
+    for (i = 0; i != 8; ++i)
+        levels[i] = 1;
+
+    rv = get_cabac_greater1_bits(c, n, state0);
+
+    *pprev_subset_coded = 0;
+    *psum = n;
+
+    rv <<= (32 - n);
+    if (rv != 0)
+    {
+        *pprev_subset_coded = 1;
+        *psum = n + 1;
+        i = hevc_clz32(rv);
+        levels[i] = 2;
+        if (get_cabac(c, state_gt2) == 0)
+        {
+            // Unset first coded bit
+            rv &= ~(0x80000000U >> i);
+        }
+    }
+
+    if (n_end > 8) {
+        const unsigned int g8 = n_end - 8;
+        rv |= ((1 << g8) - 1) << (24 - g8);
+        for (i = 0; i != g8; ++i) {
+            levels[i + 8] = 0;
+        }
+    }
+
+    return rv;
+}
+
+// extended_precision_processing_flag must be false given we are
+// putting the result into a 16-bit array
+// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
+// scale_m is uint8_t
+//
+// scale is [40 - 72] << [0..12] based on qp - worst case is (45 << 12)
+//   or it can be 2 (if we have transquant_bypass)
+// shift is set to one less than we really want but would normally be
+//   s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
+// however the scale shift is subtracted from shift to a min of 0 so scale_m worst = 45 << 6
+// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
+// to achieve it
+
+#ifndef trans_scale_sat
+static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+{
+    return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
+}
+#endif
+
+
+#ifndef update_rice
+static inline void update_rice(uint8_t * const stat_coeff,
+    const unsigned int last_coeff_abs_level_remaining,
+    const unsigned int c_rice_param)
+{
+    const unsigned int x = last_coeff_abs_level_remaining >> c_rice_param;
+    if (x >= 3)
+        (*stat_coeff)++;
+    else if (x == 0 && *stat_coeff > 0)
+        (*stat_coeff)--;
+}
+#endif
+
+
+// n must be > 0 on entry
+#ifndef get_cabac_sig_coeff_flag_idxs
+static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+    unsigned int n,
+    const uint8_t * const ctx_map,
+    uint8_t * p)
+{
+    do {
+        if (get_cabac(c, state0 + ctx_map[n]))
+            *p++ = n;
+    } while (--n != 0);
+    return p;
+}
+#endif
+
+
+static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+    unsigned int n,
+    const uint8_t * const ctx_map,
+    uint8_t * const flag_idx)
+{
+    int rv;
+
+    rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
+
+    return rv;
+}
+
+#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+     x0,  x1,  x2,  x3,\
+     x4,  x5,  x6,  x7,\
+     x8,  x9, x10, x11,\
+    x12, x13, x14, x15}
+
+#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+     x0,  x4,  x8, x12,\
+     x1,  x5,  x9, x13,\
+     x2,  x6, x10, x14,\
+     x3,  x7, x11, x15}
+
+#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+     x0,  x4,  x1,  x8,\
+     x5,  x2, x12,  x9,\
+     x6,  x3, x13, x10,\
+     x7, x14, x11, x15}
+
+
+static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz,
+    uint8_t * const significant_coeff_group_flag,
+    const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
+    int * const pPrev_sig)
+{
+    while (--i >= 0) {
+        unsigned int x_cg = scan_x_cg[i];
+        unsigned int y_cg = scan_y_cg[i];
+
+        // For the flag decode we only care about Z/NZ, but we use the
+        // full Right + Down * 2 value when calculating significant coeff
+        // flags, so we obtain it here.
+        //
+        // The group flag array is one longer than it needs to
+        // be so we don't need to check for y_cg limits
+        unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) |
+            (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1);
+
+        if (i == 0 ||
+            significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig))
+        {
+            significant_coeff_group_flag[y_cg] |= (1 << x_cg);
+            *pPrev_sig = prev_sig;
+            break;
+        }
+    }
+
+    return i;
+}
+
 
 void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
                                 int log2_trafo_size, enum ScanType scan_idx,
                                 int c_idx)
 {
-#define GET_COORD(offset, n)                                    \
-    do {                                                        \
-        x_c = (x_cg << 2) + scan_x_off[n];                      \
-        y_c = (y_cg << 2) + scan_y_off[n];                      \
-    } while (0)
-    HEVCLocalContext *lc = s->HEVClc;
-    int transform_skip_flag = 0;
+    HEVCLocalContext * const lc = s->HEVClc;
+    int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
 
     int last_significant_coeff_x, last_significant_coeff_y;
-    int last_scan_pos;
-    int n_end;
     int num_coeff = 0;
-    int greater1_ctx = 1;
+    int prev_subset_coded = 0;
 
     int num_last_subset;
     int x_cg_last_sig, y_cg_last_sig;
 
-    const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off;
+    const uint8_t *scan_x_cg, *scan_y_cg;
+    const xy_off_t * scan_xy_off;
 
     ptrdiff_t stride = s->frame->linesize[c_idx];
     int hshift = s->ps.sps->hshift[c_idx];
@@ -1032,24 +1529,29 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
     uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride +
                                           ((x0 >> hshift) << s->ps.sps->pixel_shift)];
     int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
-    uint8_t significant_coeff_group_flag[8][8] = {{0}};
+    uint8_t significant_coeff_group_flag[9] = {0};  // Allow 1 final byte that is always zero
     int explicit_rdpcm_flag = 0;
     int explicit_rdpcm_dir_flag;
 
     int trafo_size = 1 << log2_trafo_size;
     int i;
-    int qp,shift,add,scale,scale_m;
-    const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
+    int qp,shift,scale;
+    static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };
     const uint8_t *scale_matrix = NULL;
     uint8_t dc_scale;
     int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode :
                                          lc->tu.intra_pred_mode_c;
 
+    int prev_sig = 0;
+    const int c_idx_nz = (c_idx != 0);
+
+    int may_hide_sign;
+
     memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t));
 
     // Derive QP for dequant
     if (!lc->cu.cu_transquant_bypass_flag) {
-        static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
+        static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 };
         static const uint8_t rem6[51 + 4 * 6 + 1] = {
             0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2,
             3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5,
@@ -1065,9 +1567,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
         };
         int qp_y = lc->qp_y;
 
+        may_hide_sign = s->ps.pps->sign_data_hiding_flag;
+
         if (s->ps.pps->transform_skip_enabled_flag &&
             log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
-            transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx);
+            int transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx_nz);
+            if (transform_skip_flag) {
+                trans_skip_or_bypass = 1;
+                if (lc->cu.pred_mode ==  MODE_INTRA  &&
+                    s->ps.sps->implicit_rdpcm_enabled_flag &&
+                    (pred_mode_intra == 10 || pred_mode_intra == 26)) {
+                    may_hide_sign = 0;
+                }
+            }
         }
 
         if (c_idx == 0) {
@@ -1100,39 +1612,73 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
             qp += s->ps.sps->qp_bd_offset;
         }
 
-        shift    = s->ps.sps->bit_depth + log2_trafo_size - 5;
-        add      = 1 << (shift-1);
-        scale    = level_scale[rem6[qp]] << (div6[qp]);
-        scale_m  = 16; // default when no custom scaling lists.
-        dc_scale = 16;
+        // Shift is set to one less than the shift that will actually be
+        // applied, as the scale-and-saturate step adds 1 and then shifts
+        // right once more.
+        shift = s->ps.sps->bit_depth + log2_trafo_size - 6;
+        scale = level_scale[rem6[qp]];
+        if (div6[qp] >= shift) {
+            scale <<= (div6[qp] - shift);
+            shift = 0;
+        } else {
+            shift -= div6[qp];
+        }
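+        // Worked example (div6[] is the usual qp / 6 table): with
+        // bit_depth == 8, log2_trafo_size == 4 and qp == 29, shift starts
+        // at 8 + 4 - 6 == 6; div6[29] == 4 < 6 so shift becomes 2 and
+        // scale stays at level_scale[rem6[29]] == level_scale[5] == 72.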
 
-        if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
+        if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
             const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ?
-            &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
+                &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
             int matrix_id = lc->cu.pred_mode != MODE_INTRA;
 
             matrix_id = 3 * matrix_id + c_idx;
 
             scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
+            dc_scale = scale_matrix[0];
             if (log2_trafo_size >= 4)
                 dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
         }
+        else
+        {
+            static const uint8_t sixteen_scale[64] = {
+                16, 16, 16, 16, 16, 16, 16, 16,
+                16, 16, 16, 16, 16, 16, 16, 16,
+                16, 16, 16, 16, 16, 16, 16, 16,
+                16, 16, 16, 16, 16, 16, 16, 16,
+                16, 16, 16, 16, 16, 16, 16, 16,
+                16, 16, 16, 16, 16, 16, 16, 16,
+                16, 16, 16, 16, 16, 16, 16, 16,
+                16, 16, 16, 16, 16, 16, 16, 16
+            };
+            scale_matrix = sixteen_scale;
+            dc_scale = 16;
+        }
     } else {
+        static const uint8_t unit_scale[64] = {
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1,
+        };
+        scale_matrix = unit_scale;
         shift        = 0;
-        add          = 0;
-        scale        = 0;
-        dc_scale     = 0;
+        scale        = 2;  // We will shift right to kill this
+        dc_scale     = 1;
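+        // With scale == 2, a unit scale matrix and shift == 0 the later
+        // scale-and-saturate step ("add 1, shift right once more") gives
+        // (x * 2 + 1) >> 1 == x, so bypass levels pass through unchanged.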
+
+        may_hide_sign = 0;
     }
 
     if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
-        (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
-        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx);
+        trans_skip_or_bypass) {
+        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx_nz);
         if (explicit_rdpcm_flag) {
-            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx);
+            may_hide_sign = 0;
+            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx_nz);
         }
     }
 
-    last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size,
+    last_significant_coeff_xy_prefix_decode(s, c_idx_nz, log2_trafo_size,
                                            &last_significant_coeff_x, &last_significant_coeff_y);
 
     if (last_significant_coeff_x > 3) {
@@ -1160,119 +1706,113 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
         int last_x_c = last_significant_coeff_x & 3;
         int last_y_c = last_significant_coeff_y & 3;
 
-        scan_x_off = ff_hevc_diag_scan4x4_x;
-        scan_y_off = ff_hevc_diag_scan4x4_y;
         num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
-        if (trafo_size == 4) {
+
+        switch (log2_trafo_size) {
+        case 2:
             scan_x_cg = scan_1x1;
             scan_y_cg = scan_1x1;
-        } else if (trafo_size == 8) {
+            break;
+        case 3:
             num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
             scan_x_cg = diag_scan2x2_x;
             scan_y_cg = diag_scan2x2_y;
-        } else if (trafo_size == 16) {
+            break;
+        case 4:
             num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
             scan_x_cg = ff_hevc_diag_scan4x4_x;
             scan_y_cg = ff_hevc_diag_scan4x4_y;
-        } else { // trafo_size == 32
+            break;
+        case 5:
+        default:
             num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
             scan_x_cg = ff_hevc_diag_scan8x8_x;
             scan_y_cg = ff_hevc_diag_scan8x8_y;
+            break;
         }
         break;
     }
     case SCAN_HORIZ:
         scan_x_cg = horiz_scan2x2_x;
         scan_y_cg = horiz_scan2x2_y;
-        scan_x_off = horiz_scan4x4_x;
-        scan_y_off = horiz_scan4x4_y;
         num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
         break;
     default: //SCAN_VERT
         scan_x_cg = horiz_scan2x2_y;
         scan_y_cg = horiz_scan2x2_x;
-        scan_x_off = horiz_scan4x4_y;
-        scan_y_off = horiz_scan4x4_x;
         num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
         break;
     }
     num_coeff++;
     num_last_subset = (num_coeff - 1) >> 4;
 
-    for (i = num_last_subset; i >= 0; i--) {
-        int n, m;
-        int x_cg, y_cg, x_c, y_c, pos;
-        int implicit_non_zero_coeff = 0;
-        int64_t trans_coeff_level;
-        int prev_sig = 0;
-        int offset = i << 4;
-        int rice_init = 0;
-
-        uint8_t significant_coeff_flag_idx[16];
-        uint8_t nb_significant_coeff_flag = 0;
+    significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // Sub-block of the last significant coeff (decoded first) is always coded
 
-        x_cg = scan_x_cg[i];
-        y_cg = scan_y_cg[i];
+    scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
 
-        if ((i < num_last_subset) && (i > 0)) {
-            int ctx_cg = 0;
-            if (x_cg < (1 << (log2_trafo_size - 2)) - 1)
-                ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg];
-            if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
-                ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
-
-            significant_coeff_group_flag[x_cg][y_cg] =
-                significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
-            implicit_non_zero_coeff = 1;
-        } else {
-            significant_coeff_group_flag[x_cg][y_cg] =
-            ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
-             (x_cg == 0 && y_cg == 0));
-        }
+    i = num_last_subset;
+    do {
+        int implicit_non_zero_coeff = 0;
+        int n_end;
 
-        last_scan_pos = num_coeff - offset - 1;
+        uint8_t significant_coeff_flag_idx[16];
+        unsigned int nb_significant_coeff_flag = 0;
 
         if (i == num_last_subset) {
+            // First time through
+            int last_scan_pos = num_coeff - (i << 4) - 1;
             n_end = last_scan_pos - 1;
             significant_coeff_flag_idx[0] = last_scan_pos;
             nb_significant_coeff_flag = 1;
         } else {
             n_end = 15;
+            implicit_non_zero_coeff = (i != 0);
         }
 
-        if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
-            prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
-        if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
-            prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
-
-        if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
-            static const uint8_t ctx_idx_map[] = {
-                0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
-                1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
-                2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
-                2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
-                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2  // default
+        if (n_end >= 0) {
+            static const uint8_t ctx_idx_maps_ts2[3][16] = {
+                D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+                H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+                V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8)  // log2_trafo_size == 2
+            };
+            static const uint8_t ctx_idx_maps[3][4][16] = {
+                {
+                    D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+                    D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+                    D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+                    D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
+                },
+                {
+                    H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+                    H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+                    H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+                    H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
+                },
+                {
+                    V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
+                    V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1
+                    V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2
+                    V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
+                }
             };
             const uint8_t *ctx_idx_map_p;
             int scf_offset = 0;
-            if (s->ps.sps->transform_skip_context_enabled_flag &&
-                (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
-                ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16];
-                if (c_idx == 0) {
-                    scf_offset = 40;
-                } else {
-                    scf_offset = 14 + 27;
-                }
+
+            if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+                ctx_idx_map_p = ctx_idx_maps[0][3];
+                scf_offset = 40 + c_idx_nz;
             } else {
-                if (c_idx != 0)
+                if (c_idx_nz != 0)
                     scf_offset = 27;
+
                 if (log2_trafo_size == 2) {
-                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0];
+                    ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
                 } else {
-                    ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4];
-                    if (c_idx == 0) {
-                        if ((x_cg > 0 || y_cg > 0))
+                    ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
+                    if (!c_idx_nz) {
+                        if (i != 0)
                             scf_offset += 3;
+
                         if (log2_trafo_size == 3) {
                             scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
                         } else {
@@ -1286,34 +1826,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
                     }
                 }
             }
-            for (n = n_end; n > 0; n--) {
-                x_c = scan_x_off[n];
-                y_c = scan_y_off[n];
-                if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) {
-                    significant_coeff_flag_idx[nb_significant_coeff_flag] = n;
-                    nb_significant_coeff_flag++;
+
+            if (n_end > 0) {
+                int cnt = get_sig_coeff_flag_idxs(&s->HEVClc->cc,
+                    s->HEVClc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
+                    n_end, ctx_idx_map_p,
+                    significant_coeff_flag_idx + nb_significant_coeff_flag);
+
+                nb_significant_coeff_flag += cnt;
+                if (cnt != 0) {
                     implicit_non_zero_coeff = 0;
                 }
             }
+
             if (implicit_non_zero_coeff == 0) {
-                if (s->ps.sps->transform_skip_context_enabled_flag &&
-                    (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) {
-                    if (c_idx == 0) {
-                        scf_offset = 42;
-                    } else {
-                        scf_offset = 16 + 27;
-                    }
+                if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
+                    scf_offset = 42 + c_idx_nz;
                 } else {
                     if (i == 0) {
-                        if (c_idx == 0)
-                            scf_offset = 0;
-                        else
-                            scf_offset = 27;
+                        scf_offset = c_idx_nz ? 27 : 0;
                     } else {
                         scf_offset = 2 + scf_offset;
                     }
                 }
-                if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) {
+                if (significant_coeff_flag_decode_0(s, scf_offset) == 1) {
                     significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
                     nb_significant_coeff_flag++;
                 }
@@ -1323,141 +1859,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
             }
         }
 
-        n_end = nb_significant_coeff_flag;
-
+        if (nb_significant_coeff_flag != 0) {
+            const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
+                ((i != 0 && !c_idx_nz) ? 2 : 0) |
+                prev_subset_coded;
+            const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
+                (gt1_idx_delta << 2);
+            const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
+                gt1_idx_delta;
+
+            const unsigned int x_cg = scan_x_cg[i];
+            const unsigned int y_cg = scan_y_cg[i];
+            int16_t * const blk_coeffs = coeffs +
+                ((x_cg + (y_cg << log2_trafo_size)) << 2);
+            // This calculation is 'wrong' for log2_trafo_size == 2
+            // but that doesn't matter as in that case x_cg & y_cg
+            // are always 0 so the result is correct (0) anyway
+            const uint8_t * const blk_scale = scale_matrix +
+                (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
+
+            // * The following code block doesn't deal with these flags:
+            //   (nor did the one it replaces)
+            //
+            // cabac_bypass_alignment_enabled_flag
+            //    This should be easy but I can't find a test case
+            // extended_precision_processing_flag
+            //    This can extend the required precision past 16bits
+            //    so is probably tricky - also no example found yet
+
+#if USE_N_END_1
+            if (nb_significant_coeff_flag == 1) {
+                // There is a small gain to be had from special-casing the
+                // single transform coefficient case.  The reduction in
+                // complexity makes up for the code duplication.
+
+                int trans_coeff_level = 1;
+                int coeff_sign_flag;
+                int coded_val = 0;
+
+                // initialize first elem of coeff_abs_level_greater1_flag
+                prev_subset_coded = 0;
+
+                if (get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx0_gt1 + 1)) {
+                    trans_coeff_level = 2;
+                    prev_subset_coded = 1;
+                    coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2);
+                }
 
-        if (n_end) {
-            int first_nz_pos_in_cg;
-            int last_nz_pos_in_cg;
-            int c_rice_param = 0;
-            int first_greater1_coeff_idx = -1;
-            uint8_t coeff_abs_level_greater1_flag[8];
-            uint16_t coeff_sign_flag;
-            int sum_abs = 0;
-            int sign_hidden;
-            int sb_type;
+                // Probably not worth the overhead of starting by22 for just one value
+                coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc);
 
+                if (coded_val)
+                {
+                    if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
+                        trans_coeff_level = 3 + coeff_abs_level_remaining_decode(s, 0);
+                    } else {
+                        uint8_t * const stat_coeff =
+                            lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+                        const unsigned int c_rice_param = *stat_coeff >> 2;
+                        const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
 
-            // initialize first elem of coeff_bas_level_greater1_flag
-            int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0;
+                        trans_coeff_level = 3 + last_coeff_abs_level_remaining;
+                        update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+                    }
+                }
 
-            if (s->ps.sps->persistent_rice_adaptation_enabled_flag) {
-                if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag)
-                    sb_type = 2 * (c_idx == 0 ? 1 : 0);
-                else
-                    sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1;
-                c_rice_param = lc->stat_coeff[sb_type] / 4;
-            }
+                {
+                    const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
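+                    // k is 0 or -1; (x ^ k) - k conditionally negates x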
+                    const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
+                    const unsigned int scale_m = blk_scale[xy_off->scale];
 
-            if (!(i == num_last_subset) && greater1_ctx == 0)
-                ctx_set++;
-            greater1_ctx = 1;
-            last_nz_pos_in_cg = significant_coeff_flag_idx[0];
-
-            for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) {
-                int inc = (ctx_set << 2) + greater1_ctx;
-                coeff_abs_level_greater1_flag[m] =
-                    coeff_abs_level_greater1_flag_decode(s, c_idx, inc);
-                if (coeff_abs_level_greater1_flag[m]) {
-                    greater1_ctx = 0;
-                    if (first_greater1_coeff_idx == -1)
-                        first_greater1_coeff_idx = m;
-                } else if (greater1_ctx > 0 && greater1_ctx < 3) {
-                    greater1_ctx++;
+                    blk_coeffs[xy_off->coeff] = trans_scale_sat(
+                        (trans_coeff_level ^ k) - k,  // Apply sign
+                        scale,
+                        i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
+                        shift);
                 }
             }
-            first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1];
-
-            if (lc->cu.cu_transquant_bypass_flag ||
-                (lc->cu.pred_mode ==  MODE_INTRA  &&
-                 s->ps.sps->implicit_rdpcm_enabled_flag  &&  transform_skip_flag  &&
-                 (pred_mode_intra == 10 || pred_mode_intra  ==  26 )) ||
-                 explicit_rdpcm_flag)
-                sign_hidden = 0;
             else
-                sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4);
+#endif
+            {
+                int sign_hidden = may_hide_sign;
+                int levels[16]; // Should be able to get away with int16_t but that fails some tests
+                uint32_t coeff_sign_flags;
+                uint32_t coded_vals = 0;
+                // Sum(abs(level[]))
+                // In fact we only need the bottom bit and in some future
+                // version that may be all we calculate
+                unsigned int sum_abs;
+
+                coded_vals = get_greaterx_bits(s, nb_significant_coeff_flag, levels,
+                    &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
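+                // Each set bit in coded_vals (MSB first) marks a coefficient
+                // whose remaining magnitude is decoded in the loop below.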
+
+                if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
+                    sign_hidden = 0;
+
+                // -- Start bypass block
+
+                bypass_start(s);
+
+                coeff_sign_flags = coeff_sign_flag_decode_bypass(s, nb_significant_coeff_flag - sign_hidden);
+
+                if (coded_vals != 0)
+                {
+                    const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
+                    uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
+                        lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
+                    int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
+                    int * level = levels - 1;
+
+                    do {
+                        {
+                            const unsigned int z = hevc_clz32(coded_vals) + 1;
+                            level += z;
+                            coded_vals <<= z;
+                        }
 
-            if (first_greater1_coeff_idx != -1) {
-                coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set);
-            }
-            if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) {
-                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag);
-            } else {
-                coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1));
-            }
+                        {
+                            const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param);
+                            const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
 
-            for (m = 0; m < n_end; m++) {
-                n = significant_coeff_flag_idx[m];
-                GET_COORD(offset, n);
-                if (m < 8) {
-                    trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m];
-                    if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) {
-                        int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-
-                        trans_coeff_level += last_coeff_abs_level_remaining;
-                        if (trans_coeff_level > (3 << c_rice_param))
-                            c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
-                        if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
-                            int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
-                            if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
-                                lc->stat_coeff[sb_type]++;
-                            else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
-                                if (lc->stat_coeff[sb_type] > 0)
-                                    lc->stat_coeff[sb_type]--;
-                            rice_init = 1;
+                            sum_abs += last_coeff_abs_level_remaining + 1;
+                            *level = trans_coeff_level;
+
+                            if (stat_coeff != NULL)
+                                update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
+                            stat_coeff = NULL;
+
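+                            // Rice parameter update: bump c_rice_param once a
+                            // level exceeds 3 << c_rice_param (capped at 4
+                            // unless persistent adaptation is enabled)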
+                            if (trans_coeff_level > (3 << c_rice_param) &&
+                                (c_rice_param < 4 || rice_adaptation_enabled))
+                                ++c_rice_param;
                         }
-                    }
-                } else {
-                    int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param);
-
-                    trans_coeff_level = 1 + last_coeff_abs_level_remaining;
-                    if (trans_coeff_level > (3 << c_rice_param))
-                        c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4);
-                    if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) {
-                        int c_rice_p_init = lc->stat_coeff[sb_type] / 4;
-                        if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init))
-                            lc->stat_coeff[sb_type]++;
-                        else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init))
-                            if (lc->stat_coeff[sb_type] > 0)
-                                lc->stat_coeff[sb_type]--;
-                        rice_init = 1;
-                    }
+                    } while (coded_vals != 0);
                 }
-                if (s->ps.pps->sign_data_hiding_flag && sign_hidden) {
-                    sum_abs += trans_coeff_level;
-                    if (n == first_nz_pos_in_cg && (sum_abs&1))
-                        trans_coeff_level = -trans_coeff_level;
+
+                // sign_hidden = 0 or 1 so we can combine the tests
+                if ((sign_hidden & sum_abs) != 0) {
+                    levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
                 }
-                if (coeff_sign_flag >> 15)
-                    trans_coeff_level = -trans_coeff_level;
-                coeff_sign_flag <<= 1;
-                if(!lc->cu.cu_transquant_bypass_flag) {
-                    if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) {
-                        if(y_c || x_c || log2_trafo_size < 4) {
-                            switch(log2_trafo_size) {
-                                case 3: pos = (y_c << 3) + x_c; break;
-                                case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break;
-                                case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break;
-                                default: pos = (y_c << 2) + x_c; break;
-                            }
-                            scale_m = scale_matrix[pos];
-                        } else {
-                            scale_m = dc_scale;
-                        }
+
+                bypass_finish(s);
+
+                // -- Finish bypass block
+
+                // Scale loop
+                {
+                    int m = nb_significant_coeff_flag - 1;
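+                    // coeff_sign_flags packs the decoded signs MSB-first, so
+                    // (flags << m) >> 31 gives 0 or -1 for coefficient m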
+
+                    // Deal with DC component (if any) first
+                    if (i == 0 && significant_coeff_flag_idx[m] == 0)
+                    {
+                        const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+                        blk_coeffs[0] = trans_scale_sat(
+                            (levels[m] ^ k) - k, scale, dc_scale, shift);
+                        --m;
                     }
-                    trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift;
-                    if(trans_coeff_level < 0) {
-                        if((~trans_coeff_level) & 0xFffffffffff8000)
-                            trans_coeff_level = -32768;
-                    } else {
-                        if(trans_coeff_level & 0xffffffffffff8000)
-                            trans_coeff_level = 32767;
+
+#if !USE_N_END_1
+                    // If USE_N_END_1 is set then m was at least 1 initially
+                    if (m >= 0)
+#endif
+                    {
+                        do {
+                            const xy_off_t * const xy_off = scan_xy_off +
+                                significant_coeff_flag_idx[m];
+                            const int k = (int32_t)(coeff_sign_flags << m) >> 31;
+
+                            blk_coeffs[xy_off->coeff] = trans_scale_sat(
+                                (levels[m] ^ k) - k,
+                                scale,
+                                blk_scale[xy_off->scale],
+                                shift);
+                        } while (--m >= 0);
                     }
                 }
-                coeffs[y_c * trafo_size + x_c] = trans_coeff_level;
+
             }
         }
-    }
+    } while ((i = next_subset(s, i, c_idx_nz,
+        significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0);
 
     if (lc->cu.cu_transquant_bypass_flag) {
         if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
@@ -1467,7 +2047,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0,
             s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
         }
     } else {
-        if (transform_skip_flag) {
+        if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
             int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
                       log2_trafo_size == 2 &&
                       lc->cu.pred_mode == MODE_INTRA;
-- 
1.9.1

